diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index ce7803cf9acd2..3bc3fd8badc6d 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -7,13 +7,13 @@ ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 ENV LANGUAGE en_US.UTF-8 -ARG DEVTOOLSET_VERSION=11 +ARG DEVTOOLSET_VERSION=13 RUN yum -y update RUN yum -y install epel-release # install glibc-langpack-en make sure en_US.UTF-8 locale is available RUN yum -y install glibc-langpack-en -RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain +RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb # Just add everything as a safe.directory for git since these will be used in multiple places with git RUN git config --global --add safe.directory '*' ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH @@ -41,6 +41,7 @@ RUN bash ./install_conda.sh && rm install_conda.sh # Install CUDA FROM base as cuda ARG CUDA_VERSION=12.6 +ARG DEVTOOLSET_VERSION=13 RUN rm -rf /usr/local/cuda-* ADD ./common/install_cuda.sh install_cuda.sh COPY ./common/install_nccl.sh install_nccl.sh @@ -50,7 +51,8 @@ ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} # Preserve CUDA_VERSION for the builds ENV CUDA_VERSION=${CUDA_VERSION} # Make things in our path by default -ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH +ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH + FROM cuda as cuda12.6 RUN bash ./install_cuda.sh 12.6 @@ -68,8 +70,22 @@ FROM cuda as cuda13.0 RUN bash ./install_cuda.sh 13.0 ENV DESIRED_CUDA=13.0 -FROM ${ROCM_IMAGE} as rocm +FROM ${ROCM_IMAGE} as rocm_base +ARG DEVTOOLSET_VERSION=13 +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +# Install devtoolset on ROCm base image +RUN yum -y update && \ + yum -y install epel-release && \ + yum -y install glibc-langpack-en && \ + yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb +RUN git config --global --add safe.directory '*' +ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH + +FROM rocm_base as rocm ARG PYTORCH_ROCM_ARCH +ARG DEVTOOLSET_VERSION=13 ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ADD ./common/install_mkl.sh install_mkl.sh RUN bash ./install_mkl.sh && rm install_mkl.sh @@ -88,6 +104,7 @@ COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0 # Final step FROM ${BASE_TARGET} as final +ARG DEVTOOLSET_VERSION=13 COPY --from=openssl /opt/openssl /opt/openssl COPY --from=patchelf /patchelf /usr/local/bin/patchelf COPY --from=conda /opt/conda /opt/conda diff --git a/.ci/docker/almalinux/build.sh b/.ci/docker/almalinux/build.sh index ad234ce1ffb93..468f9b06418f7 100755 --- a/.ci/docker/almalinux/build.sh +++ b/.ci/docker/almalinux/build.sh @@ -36,11 +36,7 @@ case ${DOCKER_TAG_PREFIX} in ;; rocm*) BASE_TARGET=rocm - PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" - # 
add gfx950, gfx115x conditionally starting in ROCm 7.0 - if [[ "$ROCM_VERSION" == *"7.0"* ]]; then - PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" - fi + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151" EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" ;; *) @@ -63,7 +59,7 @@ docker build \ --target final \ --progress plain \ --build-arg "BASE_TARGET=${BASE_TARGET}" \ - --build-arg "DEVTOOLSET_VERSION=11" \ + --build-arg "DEVTOOLSET_VERSION=13" \ ${EXTRA_BUILD_ARGS} \ -t ${tmp_tag} \ $@ \ diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index d0500b89780ce..b7e61115e37d6 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -168,6 +168,18 @@ case "$tag" in VISION=yes TRITON=yes ;; + pytorch-linux-jammy-py3.11-clang12) + ANACONDA_PYTHON_VERSION=3.11 + CLANG_VERSION=12 + VISION=no + TRITON=no + ;; + pytorch-linux-jammy-py3.12-clang12) + ANACONDA_PYTHON_VERSION=3.12 + CLANG_VERSION=12 + VISION=no + TRITON=no + ;; pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3) if [[ $tag =~ "jammy" ]]; then ANACONDA_PYTHON_VERSION=3.10 @@ -176,7 +188,7 @@ case "$tag" in fi GCC_VERSION=11 VISION=yes - ROCM_VERSION=7.0 + ROCM_VERSION=7.1 NINJA_VERSION=1.9.0 TRITON=yes KATEX=yes @@ -195,9 +207,9 @@ case "$tag" in NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks) + pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks) ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=11 + GCC_VERSION=13 VISION=yes XPU_VERSION=2025.2 NINJA_VERSION=1.9.0 @@ -248,6 +260,12 @@ case "$tag" in HALIDE=yes TRITON=yes ;; + pytorch-linux-jammy-cuda12.8-py3.12-pallas) + CUDA_VERSION=12.8.1 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=11 + PALLAS=yes + ;; pytorch-linux-jammy-py3.12-triton-cpu) CUDA_VERSION=12.6 ANACONDA_PYTHON_VERSION=3.12 @@ -261,9 +279,9 @@ case "$tag" in PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 ;; - pytorch-linux-jammy-aarch64-py3.10-gcc11) + pytorch-linux-jammy-aarch64-py3.10-gcc13) ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=11 + GCC_VERSION=13 ACL=yes VISION=yes OPENBLAS=yes @@ -271,9 +289,19 @@ case "$tag" in # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes ;; - pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks) + pytorch-linux-jammy-aarch64-py3.10-clang21) ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=11 + CLANG_VERSION=21 + ACL=yes + VISION=yes + OPENBLAS=yes + # snadampal: skipping llvm src build install because the current version + # from pytorch/llvm:9.0.1 is x86 specific + SKIP_LLVM_SRC_BUILD_INSTALL=yes + ;; + pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=13 ACL=yes VISION=yes OPENBLAS=yes @@ -359,6 +387,7 @@ docker build \ --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \ --build-arg "EXECUTORCH=${EXECUTORCH}" \ --build-arg "HALIDE=${HALIDE}" \ + --build-arg "PALLAS=${PALLAS}" \ --build-arg "XPU_VERSION=${XPU_VERSION}" \ --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \ --build-arg "ACL=${ACL:-}" \ diff --git a/.ci/docker/ci_commit_pins/jax.txt b/.ci/docker/ci_commit_pins/jax.txt new file mode 100644 index 0000000000000..a3df0a6959e15 --- /dev/null +++ b/.ci/docker/ci_commit_pins/jax.txt @@ -0,0 +1 @@ +0.8.0 diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 
d893bdd32ab34..8fcbc3de469f4 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -ac80c4190aa0321f761a08af97e1e1eee41f01d9 +5df9c723de8c23508773b07fe16dd34e4c444541 diff --git a/.ci/docker/common/install_clang.sh b/.ci/docker/common/install_clang.sh index 1cb216edf1b38..93daeee919b3d 100755 --- a/.ci/docker/common/install_clang.sh +++ b/.ci/docker/common/install_clang.sh @@ -8,8 +8,8 @@ if [ -n "$CLANG_VERSION" ]; then # work around ubuntu apt-get conflicts sudo apt-get -y -f install wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - - if [[ $CLANG_VERSION == 18 ]]; then - apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" + if [[ $CLANG_VERSION -ge 18 ]]; then + apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VERSION} main" fi fi diff --git a/.ci/docker/common/install_gcc.sh b/.ci/docker/common/install_gcc.sh index 3b96bf6e0ed2f..df1c059bc3869 100644 --- a/.ci/docker/common/install_gcc.sh +++ b/.ci/docker/common/install_gcc.sh @@ -7,11 +7,11 @@ if [ -n "$GCC_VERSION" ]; then # Need the official toolchain repo to get alternate packages add-apt-repository ppa:ubuntu-toolchain-r/test apt-get update - apt-get install -y g++-$GCC_VERSION + apt-get install -y g++-$GCC_VERSION gfortran-$GCC_VERSION update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50 update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50 update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50 - + update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-"$GCC_VERSION" 50 # Cleanup package manager apt-get autoclean && apt-get clean diff --git a/.ci/docker/common/install_jax.sh b/.ci/docker/common/install_jax.sh new file mode 100755 index 0000000000000..184aedf0f94fe --- /dev/null +++ b/.ci/docker/common/install_jax.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -ex + +source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" + +# Get the pinned JAX version (same for all CUDA versions) +JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax) + +function install_jax_12() { + echo "Installing JAX ${JAX_VERSION} with CUDA 12 support" + pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + + # Verify installation + python -c "import jax" # check for errors + echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12" +} + +function install_jax_13() { + echo "Installing JAX ${JAX_VERSION} with CUDA 13 support" + pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + + # Verify installation + python -c "import jax" # check for errors + echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13" +} + +# idiomatic parameter and option handling in sh +while test $# -gt 0 +do + case "$1" in + 12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12; + ;; + 13.0|13.0.*) install_jax_13; + ;; + *) echo "bad argument $1"; exit 1 + ;; + esac + shift +done diff --git a/.ci/docker/common/install_libgomp.sh b/.ci/docker/common/install_libgomp.sh new file mode 100644 index 0000000000000..308915ec4f618 --- /dev/null +++ b/.ci/docker/common/install_libgomp.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Script used only in CD pipeline + +set -ex + +# install dependencies +dnf -y install gmp-devel libmpc-devel texinfo flex bison + +cd /usr/local/src +# fetch source for gcc 13 +git 
clone --depth 1 --single-branch -b releases/gcc-13.3.0 https://github.com/gcc-mirror/gcc.git gcc-13.3.0 + +mkdir -p gcc-13.3.0/build-gomp +cd gcc-13.3.0/build-gomp + +# configure gcc build +# I got these flags by: +# 1. downloading the source rpm for gcc-11 on AlmaLinux 8 container +# dnf install -y dnf-plugins-core rpmdevtools +# dnf download --source libgomp +# 2. extracting the gcc.spec from the source. +# rpmdev-extract gcc-xx.src.rpm +# 3. extracting optflags and ld_flags from gcc.spec: +# rpm --eval '%{optflags}' +# rpm --eval '%{build_ldflags}' +# +# I had to remove the following flags because they didn't compile for this version of libgomp: +# -Werror=format-security +# -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 +# -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 +# +# I added -march=armv8-a -mtune=generic to make them explicit. I don't think they're strictly needed. + +OPT_FLAGS='-O2 -march=armv8-a -mtune=generic'\ +' -fexceptions -g -grecord-gcc-switches -pipe -Wall'\ +' -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS'\ +' -fstack-protector-strong -fasynchronous-unwind-tables'\ +' -fstack-clash-protection' + +LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now' + +CFLAGS="$OPT_FLAGS" \ +CXXFLAGS="$OPT_FLAGS" \ +LDFLAGS="$LDFLAGS" \ +../configure \ + --prefix=/usr \ + --libdir=/usr/lib64 \ + --enable-languages=c,c++ \ + --disable-multilib \ + --disable-bootstrap \ + --enable-libgomp + +# only build libgomp +make -j$(nproc) all-target-libgomp + +make install-target-libgomp \ No newline at end of file diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh index 2f386c6bd523a..5a28068781245 100755 --- a/.ci/docker/common/install_openblas.sh +++ b/.ci/docker/common/install_openblas.sh @@ -10,6 +10,7 @@ git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" - OPENBLAS_CHECKOUT_DIR="OpenBLAS" OPENBLAS_BUILD_FLAGS=" +CC=gcc NUM_THREADS=128 USE_OPENMP=1 NO_SHARED=0 diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 9376d259d9cca..988347e28e9d8 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -60,14 +60,16 @@ EOF DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev fi - # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5 - # search for all unversioned packages - # if search fails it will abort this script; use true to avoid case where search fails - MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true) - if [[ "x${MIOPENHIPGFX}" = x ]]; then - echo "miopen-hip-gfx package not available" && exit 1 - else - DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX} + if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then + # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1 + # search for all unversioned packages + # if search fails it will abort this script; use true to avoid case where search fails + MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . 
|| true) + if [[ "x${MIOPENHIPGFX}" = x ]]; then + echo "miopen-hip-gfx package not available" && exit 1 + else + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX} + fi fi # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index 2d03c6186b8e5..9bf45e6f1b0a9 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -12,8 +12,8 @@ function do_install() { rocm_version_nodot=${rocm_version//./} - # post merge of https://github.com/icl-utk-edu/magma/pull/65 - MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f + # https://github.com/icl-utk-edu/magma/pull/65 + MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" rocm_dir="/opt/rocm" diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 0b150872f93ce..22b7af890c1f6 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -9,7 +9,7 @@ set -xe function install_ubuntu() { . /etc/os-release - if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then + if [[ ! " jammy noble " =~ " ${VERSION_CODENAME} " ]]; then echo "Ubuntu version ${VERSION_CODENAME} not supported" exit fi @@ -35,25 +35,24 @@ function install_ubuntu() { # The xpu-smi packages apt-get install -y flex bison xpu-smi - if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then - # Compute and Media Runtimes + # Compute and Media Runtimes + if [[ " ${VERSION_CODENAME} " =~ " noble " ]]; then apt-get install -y \ - intel-opencl-icd intel-level-zero-gpu level-zero \ - intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ - libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + intel-opencl-icd libze-intel-gpu1 libze1 \ + intel-media-va-driver-non-free libmfx-gen1 libvpl2 \ + libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ - mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo - # Development Packages - apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev - else # rolling driver + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc + else # jammy apt-get install -y \ intel-opencl-icd libze-intel-gpu1 libze1 \ intel-media-va-driver-non-free libmfx-gen1 libvpl2 \ libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc - apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev fi + # Development Packages + apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev # Install Intel Support Packages apt-get install -y ${XPU_PACKAGES} @@ -66,7 +65,7 @@ function install_ubuntu() { function install_rhel() { . /etc/os-release if [[ "${ID}" == "rhel" ]]; then - if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then + if [[ ! 
" 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then echo "RHEL version ${VERSION_ID} not supported" exit fi @@ -147,7 +146,7 @@ function install_sles() { XPU_DRIVER_VERSION="" if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then # Use GPU driver LTS releases - XPU_DRIVER_VERSION="/lts/2350" + XPU_DRIVER_VERSION="/lts/2523" fi # Default use Intel® oneAPI Deep Learning Essentials 2025.1 diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh index c40896cb5499f..76d3e01e1c38f 100755 --- a/.ci/docker/libtorch/build.sh +++ b/.ci/docker/libtorch/build.sh @@ -49,11 +49,7 @@ case ${DOCKER_TAG_PREFIX} in fi BASE_TARGET=rocm GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete - PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" - # add gfx950, gfx115x conditionally starting in ROCm 7.0 - if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then - PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" - fi + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151" DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" ;; *) diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index 768db09929361..78ee09d128cb0 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -50,6 +50,10 @@ RUN rm install_ninja.sh ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH +# Build a newer version of libgomp than that supported in in Almalinux 8. 
+COPY ./common/install_libgomp.sh install_libgomp.sh +RUN bash ./install_libgomp.sh && rm install_libgomp.sh + # git236+ would refuse to run git commands in repos owned by other users # Which causes version check to fail, as pytorch repo is bind-mounted into the image # Override this behaviour by treating every folder as safe diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index ac385ce4b29fd..8f9059dc0cc12 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -87,11 +87,7 @@ case ${image} in MANY_LINUX_VERSION="2_28" DEVTOOLSET_VERSION="11" GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete - PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" - # add gfx950, gfx115x conditionally starting in ROCm 7.0 - if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then - PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" - fi + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151" DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" ;; manylinux2_28-builder:xpu) diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 6e623b4c56949..de71919012e13 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,15 +1,11 @@ -sphinx==5.3.0 +sphinx==7.2.6 #Description: This is used to generate PyTorch docs -#Pinned versions: 5.3.0 +#Pinned versions: 7.2.6 -standard-imghdr==3.13.0; python_version >= "3.13" -#Description: This is needed by Sphinx, so it needs to be added here. -# The reasons are as follows: -# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr); -# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13. -# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency. +pytorch_sphinx_theme2==0.2.0 +#Description: This is needed to generate PyTorch docs +#Pinned versions: 0.2.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought that it is probably # something related to Docker setup. We can investigate this later. @@ -36,17 +32,17 @@ tensorboard==2.18.0 ; python_version >= "3.13" #Description: This is used to generate PyTorch docs #Pinned versions: 2.13.0 -breathe==4.34.0 +breathe==4.36.0 #Description: This is used to generate PyTorch C++ docs -#Pinned versions: 4.34.0 +#Pinned versions: 4.36.0 -exhale==0.2.3 +exhale==0.3.7 #Description: This is used to generate PyTorch C++ docs -#Pinned versions: 0.2.3 +#Pinned versions: 0.3.7 -docutils==0.16 +docutils==0.20 #Description: This is used to generate PyTorch C++ docs -#Pinned versions: 0.16 +#Pinned versions: 0.20 bs4==0.0.1 #Description: This is used to generate PyTorch C++ docs @@ -56,13 +52,13 @@ IPython==8.12.0 #Description: This is used to generate PyTorch functorch docs #Pinned versions: 8.12.0 -myst-nb==0.17.2 +myst-nb==1.3.0 #Description: This is used to generate PyTorch functorch and torch.compile docs. 
-#Pinned versions: 0.17.2 +#Pinned versions: 1.3.0 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs python-etcd==0.4.5 sphinx-copybutton==0.5.0 -sphinx-design==0.4.0 +sphinx-design==0.6.1 sphinxcontrib-mermaid==1.0.0 -myst-parser==0.18.1 +myst-parser==4.0.1 diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 84a74114c381e..2081dcbdffd17 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -143,6 +143,15 @@ COPY ci_commit_pins/halide.txt halide.txt RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi RUN rm install_halide.sh common_utils.sh halide.txt +ARG PALLAS +ARG CUDA_VERSION +# Install JAX with CUDA support (for Pallas) +COPY ./common/install_jax.sh install_jax.sh +COPY ./common/common_utils.sh common_utils.sh +COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt +RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi +RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt + ARG ONNX # Install ONNX dependencies COPY ./common/install_onnx.sh ./common/common_utils.sh ./ diff --git a/.ci/lumen_cli/cli/lib/common/cli_helper.py b/.ci/lumen_cli/cli/lib/common/cli_helper.py index 927ca09fe7230..4086eb7d46e81 100644 --- a/.ci/lumen_cli/cli/lib/common/cli_helper.py +++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py @@ -8,9 +8,11 @@ try: - from typing import Any, Callable, Required, TypedDict # Python 3.11+ + from collections.abc import Callable # Python 3.11+ + from typing import Any, Required, TypedDict except ImportError: - from typing import Any, Callable, TypedDict + from collections.abc import Callable + from typing import Any, TypedDict from typing_extensions import Required # Fallback for Python <3.11 diff --git a/.ci/magma-rocm/README.md b/.ci/magma-rocm/README.md index cfc3cd3ab1632..3fe1e5d976ccd 100644 --- a/.ci/magma-rocm/README.md +++ b/.ci/magma-rocm/README.md @@ -30,7 +30,6 @@ into a tarball, with the following structure: More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version. Outputted binaries should be in the `output` folder. - ## Pushing Packages can be uploaded to an S3 bucket using: diff --git a/.ci/magma-rocm/build_magma.sh b/.ci/magma-rocm/build_magma.sh index 7d95fed873dc0..c7c7780227ea5 100755 --- a/.ci/magma-rocm/build_magma.sh +++ b/.ci/magma-rocm/build_magma.sh @@ -6,8 +6,8 @@ set -eou pipefail # The script expects DESIRED_CUDA and PACKAGE_NAME to be set ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" -# post merge of https://github.com/icl-utk-edu/magma/pull/65 -MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f +# https://github.com/icl-utk-edu/magma/pull/65 +MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec # Folders for the build PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata @@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE # Fetch magma sources and verify checksum pushd ${PACKAGE_DIR} -git clone https://github.com/icl-utk-edu/magma +git clone https://github.com/jeffdaily/magma pushd magma git checkout ${MAGMA_VERSION} popd diff --git a/.ci/onnx/common.sh b/.ci/onnx/common.sh index 3de5836a02858..b8f912fbbb4e6 100644 --- a/.ci/onnx/common.sh +++ b/.ci/onnx/common.sh @@ -21,3 +21,87 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then fi mkdir -p "$pytest_reports_dir" || true + +########################################## +# copied from .ci/pytorch/common_utils.sh +########################################## + +function get_pinned_commit() { + cat .github/ci_commit_pins/"${1}".txt +} + +function pip_install_whl() { + # This is used to install PyTorch and other build artifacts wheel locally + # without using any network connection + + # Convert the input arguments into an array + local args=("$@") + + # Check if the first argument contains multiple paths separated by spaces + if [[ "${args[0]}" == *" "* ]]; then + # Split the string by spaces into an array + IFS=' ' read -r -a paths <<< "${args[0]}" + # Loop through each path and install individually + for path in "${paths[@]}"; do + echo "Installing $path" + python3 -mpip install --no-index --no-deps "$path" + done + else + # Loop through each argument and install individually + for path in "${args[@]}"; do + echo "Installing $path" + python3 -mpip install --no-index --no-deps "$path" + done + fi +} + +function pip_build_and_install() { + local build_target=$1 + local wheel_dir=$2 + + local found_whl=0 + for file in "${wheel_dir}"/*.whl + do + if [[ -f "${file}" ]]; then + found_whl=1 + break + fi + done + + # Build the wheel if it doesn't exist + if [ "${found_whl}" == "0" ]; then + python3 -m pip wheel \ + --no-build-isolation \ + --no-deps \ + -w "${wheel_dir}" \ + "${build_target}" + fi + + for file in "${wheel_dir}"/*.whl + do + pip_install_whl "${file}" + done +} + +function install_torchvision() { + local orig_preload + local commit + commit=$(get_pinned_commit vision) + orig_preload=${LD_PRELOAD} + if [ -n "${LD_PRELOAD}" ]; then + # Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9 + echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c - + LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so + fi + + if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then + # Not sure if both are needed, but why not + export FORCE_CUDA=1 + export WITH_CUDA=1 + fi + pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision + + if [ -n "${LD_PRELOAD}" ]; then + LD_PRELOAD=${orig_preload} + fi +} diff --git a/.ci/onnx/test.sh b/.ci/onnx/test.sh index d42ca2c218dec..1f2a23b49dc45 100755 --- a/.ci/onnx/test.sh +++ b/.ci/onnx/test.sh @@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # TODO: This can be removed later once vision is also part of the Docker image - pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" + 
install_torchvision # JIT C++ extensions require ninja, so put it into PATH. export PATH="/var/lib/jenkins/.local/bin:$PATH" # NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index d66aa1120fb30..071f14700def4 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -168,14 +168,16 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then # shellcheck disable=SC1091 source /opt/intel/oneapi/compiler/latest/env/vars.sh # shellcheck disable=SC1091 + source /opt/intel/oneapi/umf/latest/env/vars.sh + # shellcheck disable=SC1091 source /opt/intel/oneapi/ccl/latest/env/vars.sh # shellcheck disable=SC1091 source /opt/intel/oneapi/mpi/latest/env/vars.sh + # shellcheck disable=SC1091 + source /opt/intel/oneapi/pti/latest/env/vars.sh # Enable XCCL build export USE_XCCL=1 export USE_MPI=0 - # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA - export USE_KINETO=0 export TORCH_XPU_ARCH_LIST=pvc fi diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 9c9d223777466..323ac6cacd889 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -96,7 +96,6 @@ function pip_build_and_install() { python3 -m pip wheel \ --no-build-isolation \ --no-deps \ - --no-use-pep517 \ -w "${wheel_dir}" \ "${build_target}" fi @@ -308,6 +307,28 @@ function install_torchao() { pip_build_and_install "git+https://github.com/pytorch/ao.git@${commit}" dist/ao } +function install_flash_attn_cute() { + echo "Installing FlashAttention CuTe from GitHub..." + # Grab latest main til we have a pinned commit + local flash_attn_commit + flash_attn_commit=$(git ls-remote https://github.com/Dao-AILab/flash-attention.git HEAD | cut -f1) + + # Clone the repo to a temporary directory + rm -rf flash-attention-build + git clone --depth 1 --recursive https://github.com/Dao-AILab/flash-attention.git flash-attention-build + + pushd flash-attention-build + git checkout "${flash_attn_commit}" + + # Install only the 'cute' sub-directory + pip_install -e flash_attn/cute/ + popd + + # remove the local repo + rm -rf flash-attention-build + echo "FlashAttention CuTe installation complete." +} + function print_sccache_stats() { echo 'PyTorch Build Statistics' sccache --show-stats diff --git a/.ci/pytorch/python_doc_push_script.sh b/.ci/pytorch/python_doc_push_script.sh index ec1187b3fe4c4..6bcd46c4815a6 100755 --- a/.ci/pytorch/python_doc_push_script.sh +++ b/.ci/pytorch/python_doc_push_script.sh @@ -89,23 +89,41 @@ if [ "$is_main_doc" = true ]; then make coverage # Now we have the coverage report, we need to make sure it is empty. - # Count the number of lines in the file and turn that number into a variable - # $lines. The `cut -f1 ...` is to only parse the number, not the filename - # Skip the report header by subtracting 2: the header will be output even if - # there are no undocumented items. + # Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row + # showing the undocumented count in the third column. + # Example: | TOTAL | 99.83% | 2 | # # Also: see docs/source/conf.py for "coverage_ignore*" items, which should # be documented then removed from there. 
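As a side note (not part of the patch): the comments above describe the Sphinx 7.2.6 coverage statistics table that the new parsing code below consumes. A tiny self-contained reproduction of that parsing, on an invented sample `python.txt`, looks like this; the file contents are made up purely to illustrate the format the script expects.

```bash
# Not part of the patch: toy reproduction of the TOTAL-row parsing on an invented file.
cat > /tmp/python.txt <<'EOF'
| Module    | Coverage | Undocumented |
|-----------|----------|--------------|
| torch.foo | 99.50%   | 1            |
| TOTAL     | 99.83%   | 2            |

torch.foo.bar  -- undocumented helper (invented example)
EOF
undocumented=$(grep "| TOTAL" /tmp/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
echo "undocumented objects: ${undocumented}"      # -> 2
total_line=$(grep -n "| TOTAL" /tmp/python.txt | cut -d: -f1)
tail -n +$((total_line + 2)) /tmp/python.txt      # prints only the detailed list after the table
```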
- lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ') - undocumented=$((lines - 2)) - if [ $undocumented -lt 0 ]; then + + # Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table + # The table format is: | Module | Coverage | Undocumented | + # Extract the third column (undocumented count) from the TOTAL row + undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ') + + if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then echo coverage output not found exit 1 - elif [ $undocumented -gt 0 ]; then - echo undocumented objects found: - cat build/coverage/python.txt + elif [ "$undocumented" -gt 0 ]; then + set +x # Disable command echoing for cleaner output + echo "" + echo "=====================" + echo "UNDOCUMENTED OBJECTS:" + echo "=====================" + echo "" + # Find the line number of the TOTAL row and print only what comes after it + total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1) + if [ -n "$total_line" ]; then + # Print only the detailed list (skip the statistics table) + tail -n +$((total_line + 2)) build/coverage/python.txt + else + # Fallback to showing entire file if TOTAL line not found + cat build/coverage/python.txt + fi + echo "" echo "Make sure you've updated relevant .rsts in docs/source!" - echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'" + echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((\$(grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'" + set -x # Re-enable command echoing exit 1 fi else diff --git a/.ci/pytorch/smoke_test/smoke_test.py b/.ci/pytorch/smoke_test/smoke_test.py index 675d58a3e283d..e760340bebb12 100644 --- a/.ci/pytorch/smoke_test/smoke_test.py +++ b/.ci/pytorch/smoke_test/smoke_test.py @@ -353,6 +353,17 @@ def test_linalg(device="cpu") -> None: torch.linalg.svd(A) +def test_sdpa(device="cpu", dtype=torch.float16) -> None: + """Regression test for https://github.com/pytorch/pytorch/issues/167602 + Without nvrtc_builtins on CuDNN-9.13 on CUDA-13 fails with ` No valid execution plans built.` + """ + print(f"Testing SDPA on {device} using type {dtype}") + k, q, v = torch.rand(3, 1, 16, 77, 64, dtype=dtype, device=device).unbind(0) + attn = torch.rand(1, 1, 77, 77, dtype=dtype, device=device) + rc = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn) + assert rc.isnan().any().item() is False + + def smoke_test_compile(device: str = "cpu") -> None: supported_dtypes = [torch.float16, torch.float32, torch.float64] @@ -489,10 +500,12 @@ def main() -> None: smoke_test_conv2d() test_linalg() test_numpy() + test_sdpa() if is_cuda_system: test_linalg("cuda") test_cuda_gds_errors_captured() + test_sdpa("cuda") if options.package == "all": smoke_test_modules() diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 9ae2578758939..01075259e9fe9 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -208,6 +208,8 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then source /opt/intel/oneapi/ccl/latest/env/vars.sh # shellcheck disable=SC1091 source /opt/intel/oneapi/mpi/latest/env/vars.sh + # shellcheck disable=SC1091 + source /opt/intel/oneapi/pti/latest/env/vars.sh # Check XPU status before testing timeout 30 xpu-smi discovery || true fi @@ -342,8 +344,18 @@ test_python_smoke() { } test_python_smoke_b200() { - # Targeted smoke tests for B200 - staged approach to avoid too many failures - time python
test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + # Targeted smoke tests for B200 including FlashAttention CuTe coverage + install_flash_attn_cute + time python test/run_test.py \ + --include \ + test_matmul_cuda \ + test_scaled_matmul_cuda \ + inductor/test_fp8 \ + nn/attention/test_fa4 \ + nn/attention/test_open_registry \ + inductor/test_flex_flash \ + $PYTHON_TEST_EXTRA_OPTION \ + --upload-artifacts-while-running assert_git_not_dirty } @@ -377,6 +389,13 @@ test_lazy_tensor_meta_reference_disabled() { export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE } +test_dynamo_core() { + time python test/run_test.py \ + --include-dynamo-core-tests \ + --verbose \ + --upload-artifacts-while-running + assert_git_not_dirty +} test_dynamo_wrapped_shard() { if [[ -z "$NUM_TEST_SHARDS" ]]; then @@ -824,6 +843,11 @@ test_inductor_halide() { assert_git_not_dirty } +test_inductor_pallas() { + python test/run_test.py --include inductor/test_pallas.py --verbose + assert_git_not_dirty +} + test_inductor_triton_cpu() { python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose assert_git_not_dirty @@ -1226,6 +1250,97 @@ test_custom_script_ops() { assert_git_not_dirty } +test_libtorch_agnostic_targetting() { + echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION" + + REPO_DIR=$(pwd) + WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels" + + # Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0) + echo "Building 2.9 extension wheel with current PyTorch..." + pushd test/cpp_extensions/libtorch_agnostic_2_9_extension + time python setup.py bdist_wheel + + # Save the wheel + mkdir -p "$WHEEL_DIR" + cp dist/*.whl "$WHEEL_DIR/" + WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1) + echo "Built wheel: $(basename "$WHEEL_FILE")" + popd + + # Create venv and install PyTorch 2.9 + python -m venv venv_pytorch_2_9 + # shellcheck disable=SC1091 + . venv_pytorch_2_9/bin/activate + + # Clear PYTHONPATH to avoid using the development PyTorch + echo "Clearing PYTHONPATH to use only venv packages..." + unset PYTHONPATH + + # Upgrade pip to latest version + echo "Upgrading pip to latest version..." + pip install --upgrade pip + pip --version + + echo "Installing PyTorch 2.9..." 
+ + # Install from release channel only + PYTORCH_VERSION="2.9.0" + + # Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121") + if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then + CUDA_MAJOR="${BASH_REMATCH[1]}" + CUDA_MINOR="${BASH_REMATCH[2]}" + CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}" + echo " Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}" + else + # Default to CPU build + CUDA_VERSION="cpu" + echo " No CUDA detected in BUILD_ENVIRONMENT, using CPU build" + fi + + if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then + echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})" + else + echo " FAILED to install PyTorch 2.9.0 from release channel" + echo " URL: https://download.pytorch.org/whl/${CUDA_VERSION}/" + deactivate + rm -rf venv_pytorch_2_9 + return 1 + fi + + INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown") + echo " Installed version: $INSTALLED_VERSION" + + # Install test dependencies + echo "Installing test dependencies..." + pip install expecttest numpy unittest-xml-reporting + + # Install the pre-built wheel + echo "" + echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..." + pip install "$WHEEL_FILE" + echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment" + + # Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically) + echo "" + echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..." + if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then + echo "" + echo " Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!" + else + echo "targeting test failed" + deactivate + rm -rf venv_pytorch_2_9 "$WHEEL_DIR" + return 1 + fi + + deactivate + rm -rf venv_pytorch_2_9 "$WHEEL_DIR" + + assert_git_not_dirty +} + test_jit_hooks() { echo "Testing jit hooks in cpp" HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build" @@ -1663,6 +1778,22 @@ test_operator_microbenchmark() { done } +test_attention_microbenchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + TEST_DIR=$(pwd) + + # Install attention-gym dependency + echo "Installing attention-gym..." + python -m pip install git+https://github.com/meta-pytorch/attention-gym.git@main + pip show triton + + cd "${TEST_DIR}"/benchmarks/transformer + + $TASKSET python score_mod.py --config configs/config_basic.yaml \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/attention_microbenchmark.json" +} + if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") @@ -1682,6 +1813,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; elif [[ "${TEST_CONFIG}" == *backward* ]]; then test_forward_backward_compatibility # Do NOT add tests after bc check tests, see its comment. 
+elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then + test_libtorch_agnostic_targetting elif [[ "${TEST_CONFIG}" == *xla* ]]; then install_torchvision build_xla @@ -1720,10 +1853,14 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then fi elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then test_operator_microbenchmark +elif [[ "${TEST_CONFIG}" == *attention_microbenchmark* ]]; then + test_attention_microbenchmark elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then test_inductor_halide +elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then + test_inductor_pallas elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then test_inductor_triton_cpu elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then @@ -1777,6 +1914,8 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then test_inductor_shard "${SHARD_NUMBER}" elif [[ "${TEST_CONFIG}" == *einops* ]]; then test_einops +elif [[ "${TEST_CONFIG}" == *dynamo_core* ]]; then + test_dynamo_core elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then install_torchvision test_dynamo_wrapped_shard "${SHARD_NUMBER}" diff --git a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 index a165f2a222caf..f0eabed170d25 100644 --- a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 +++ b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 @@ -70,7 +70,7 @@ sccache --zero-stats sccache --show-stats # Build the wheel -python -m build --wheel --no-build-isolation +python -m build --wheel --no-isolation if ($LASTEXITCODE -ne 0) { exit 1 } # Install the wheel locally diff --git a/.github/ISSUE_TEMPLATE/release-feature-request.yml b/.github/ISSUE_TEMPLATE/release-feature-request.yml index 80f10807ae56b..090a41d1942f6 100644 --- a/.github/ISSUE_TEMPLATE/release-feature-request.yml +++ b/.github/ISSUE_TEMPLATE/release-feature-request.yml @@ -1,11 +1,11 @@ -name: 🚀 Release highlight for proposed Feature +name: 🚀 New Feature for Release description: Submit a Release highlight for proposed Feature labels: ["release-feature-request"] body: - type: textarea attributes: - label: Release highlight for proposed Feature + label: New Feature for Release description: > Example: “A torch.special module, analogous to SciPy's special module.” - type: input diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index d021371ca8863..dfb30e155b162 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -63,7 +63,7 @@ self-hosted-runner: - linux.rocm.gpu.gfx942.1 - linux.rocm.gpu.gfx942.2 - linux.rocm.gpu.gfx942.4 - - rocm-docker + - linux.rocm.gfx942.docker-cache # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors) - macos-m1-stable - macos-m1-14 diff --git a/.github/actions/pytest-cache-download/action.yml b/.github/actions/pytest-cache-download/action.yml index 1406f962c4ca8..3f51f6a5525bc 100644 --- a/.github/actions/pytest-cache-download/action.yml +++ b/.github/actions/pytest-cache-download/action.yml @@ -38,9 +38,9 @@ runs: run: | python3 .github/scripts/pytest_cache.py \ --download \ - --cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \ - --pr_identifier $GITHUB_REF \ - --job_identifier $JOB_IDENTIFIER \ - --temp_dir $RUNNER_TEMP \ - --repo $REPO \ - --bucket $BUCKET \ + --cache_dir "$GITHUB_WORKSPACE/$CACHE_DIR" \ + --pr_identifier "$GITHUB_REF" \ + --job_identifier "$JOB_IDENTIFIER" \ + --temp_dir "$RUNNER_TEMP" \ + --repo "$REPO" \ 
+ --bucket "$BUCKET" \ diff --git a/.github/actions/pytest-cache-upload/action.yml b/.github/actions/pytest-cache-upload/action.yml index 2652d019075f7..9fbb63a760f27 100644 --- a/.github/actions/pytest-cache-upload/action.yml +++ b/.github/actions/pytest-cache-upload/action.yml @@ -47,11 +47,11 @@ runs: run: | python3 .github/scripts/pytest_cache.py \ --upload \ - --cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \ - --pr_identifier $GITHUB_REF \ - --job_identifier $JOB_IDENTIFIER \ - --sha $SHA \ - --test_config $TEST_CONFIG \ - --shard $SHARD \ - --repo $REPO \ - --temp_dir $RUNNER_TEMP \ + --cache_dir "$GITHUB_WORKSPACE/$CACHE_DIR" \ + --pr_identifier "$GITHUB_REF" \ + --job_identifier "$JOB_IDENTIFIER" \ + --sha "$SHA" \ + --test_config "$TEST_CONFIG" \ + --shard "$SHARD" \ + --repo "$REPO" \ + --temp_dir "$RUNNER_TEMP" \ diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 966f6bcfc0d94..616dfd88ce812 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2 +ee1a1350eb37804b94334768f328144f058f14e9 diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 183e9fb4b06e1..64ee992f566b7 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -cfbc5c2f1c798991715a6b06bb3ce46478c4487c +2d82dc5caa336d179d9b46ac4a0fb8c43d84c5cc diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 01f0673fcf802..803ba72d9ac92 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9 +94631807d22c09723dd006f7be5beb649d5f88d0 diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000000000..06c3f32abd5e1 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,125 @@ +# PyTorch Copilot Instructions + +This is the PyTorch machine learning framework codebase. These instructions help AI agents navigate and contribute effectively. + +## Architecture Overview + +### Core Components + +- **c10/** - Core library (C++-10 compatible) for essential, binary-size-conscious functionality +- **aten/** - ATen tensor library (C++), PyTorch's foundation without autograd + - `aten/src/ATen/native/` - Modern operator implementations (CPU/CUDA/MPS/sparse) + - `aten/src/ATen/native/native_functions.yaml` - **Critical**: Declarative operator registry +- **torch/** - Python bindings and public API + - `torch/csrc/` - C++ Python bindings (hand-written and generated) + - `torch/csrc/autograd/` - Reverse-mode automatic differentiation + - `torch/csrc/jit/` - TorchScript JIT compiler +- **torchgen/** - Code generation tooling that reads `native_functions.yaml` +- **tools/** - Build scripts, autograd derivatives, code generation + +### The Code Generation Workflow + +**Most operator changes require editing `native_functions.yaml`**, not direct C++ files. This YAML file: +1. Declares operator signatures, variants (function/method), and dispatch behavior +2. Gets processed by `torchgen/` to generate C++/Python bindings +3. Produces headers in `build/aten/src/ATen/` during compilation + +Example entry structure: +```yaml +- func: my_op(Tensor self, Scalar alpha=1) -> Tensor + variants: function, method + dispatch: + CPU: my_op_cpu + CUDA: my_op_cuda +``` + +After editing `native_functions.yaml`, implement kernels in `aten/src/ATen/native/` (see `aten/src/ATen/native/README.md`). 
+ +## Development Workflows + +### Building from Source + +**Never run `setup.py` directly** - use pip with editable install: +```bash +python -m pip install --no-build-isolation -v -e . +``` + +Speed up builds: +- `DEBUG=1` - Debug symbols with `-g -O0` +- `USE_CUDA=0` - Skip CUDA compilation +- `BUILD_TEST=0` - Skip C++ test binaries +- Install `ninja` (`pip install ninja`) for faster builds +- Use `ccache` for incremental compilation caching + +Rebuild specific targets: `(cd build && ninja )` + +### Testing + +**Critical**: DO NOT run entire test suites. Run specific tests only: +```bash +python test/test_torch.py TestTorch.test_specific_case +``` + +**Test structure**: All tests use `torch.testing._internal.common_utils`: +```python +from torch.testing._internal.common_utils import run_tests, TestCase + +class TestFeature(TestCase): + def test_something(self): + # Use self.assertEqual for tensor comparisons + pass + +if __name__ == "__main__": + run_tests() +``` + +**For bug fixes**: Create a standalone reproduction script first, verify it fails, then fix and add to appropriate test file. + +### Linting + +Run linter (not pre-commit): `lintrunner -a` (auto-applies fixes) + +## Project-Specific Conventions + +### Memory and Storage +- **Storage is never nullptr** (but `StorageImpl.data` may be nullptr for unallocated outputs) +- CUDA device info lives in storage objects + +### Python-C++ Integration (`torch/csrc/`) +- Always include `Python.h` **first** to avoid `_XOPEN_SOURCE` redefinition errors +- Use `pybind11::gil_scoped_acquire` before calling Python API or using `THPObjectPtr` +- Wrap entry points with `HANDLE_TH_ERRORS` / `END_HANDLE_TH_ERRORS` for exception conversion + +### Dispatch System +- PyTorch uses operator dispatch to route calls to backend-specific kernels +- Prefer `CompositeExplicitAutograd` dispatch when writing device-agnostic compound ops +- See `aten/src/ATen/native/README.md` for dispatch keyword guidance + +## Git Workflow (AI Agent Specific) + +When preparing PRs from this environment: +```bash +git stash -u +git reset --hard $(cat /tmp/orig_work.txt) # Reset to LOCAL branch +git stash pop +# Resolve conflicts if necessary +``` + +## Common Gotchas + +1. **Editing generated files** - If it's in `build/`, don't edit it. Edit the source template or `native_functions.yaml` +2. **NVCC template compilation** - NVCC is stricter about C++ than gcc/clang; code working on Linux may fail Windows CI +3. **Windows symbol visibility** - Use `TORCH_API` macros for exported symbols (required on Windows, optional on Linux) +4. **No internet access** - DO NOT attempt to install dependencies during development + +## Key Files Reference + +- `AGENTS.md` - Instructions specific to AI coding agents +- `CONTRIBUTING.md` - Comprehensive human contributor guide +- `GLOSSARY.md` - Terminology (ATen, kernels, operations, JIT, TorchScript) +- `aten/src/ATen/native/README.md` - Operator implementation guide +- `tools/autograd/derivatives.yaml` - Gradient definitions for autograd + +## Performance Debugging + +Use `TORCH_SHOW_CPP_STACKTRACES=1` for C++ traces in Python errors. For profiling, prefer `py-spy` over manual instrumentation. 
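As a side note (not part of the new file): pulled together, the build/test/lint guidance above amounts to a short local loop. The sketch below only restates commands already quoted in the instructions; the test case name is the file's own placeholder, and the environment variables are the optional build-trimming flags it lists.

```bash
# Not part of the patch: a minimal local loop assembled from the commands above.
export USE_CUDA=0 BUILD_TEST=0                          # trim the build while iterating
python -m pip install --no-build-isolation -v -e .      # editable install; never run setup.py directly
python test/test_torch.py TestTorch.test_specific_case  # one targeted test (placeholder name)
lintrunner -a                                           # auto-apply lint fixes before pushing
```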
diff --git a/.github/labeler.yml b/.github/labeler.yml index 7b47b9fefb5dc..cd3b9c809039e 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -91,13 +91,6 @@ "ciflow/trunk": - .ci/docker/ci_commit_pins/triton.txt -"oncall: distributed": -- torch/csrc/distributed/** -- torch/distributed/** -- torch/nn/parallel/** -- test/distributed/** -- torch/testing/_internal/distributed/** - "release notes: distributed (checkpoint)": - torch/distributed/checkpoint/** - test/distributed/checkpoint/** @@ -138,7 +131,8 @@ - test/test_matmul_cuda.py - test/test_scaled_matmul_cuda.py - test/inductor/test_fp8.py -- aten/src/ATen/native/cuda/Blas.cpp +- aten/src/ATen/native/cuda/*Blas.cpp +- aten/src/ATen/cuda/CUDA*Blas.* - torch/**/*cublas* - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py @@ -148,7 +142,8 @@ - test/test_matmul_cuda.py - test/test_scaled_matmul_cuda.py - test/inductor/test_fp8.py -- aten/src/ATen/native/cuda/Blas.cpp +- aten/src/ATen/native/cuda/*Blas.cpp +- aten/src/ATen/cuda/CUDA*Blas.* - torch/**/*cublas* - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py @@ -158,7 +153,21 @@ - test/test_matmul_cuda.py - test/test_scaled_matmul_cuda.py - test/inductor/test_fp8.py -- aten/src/ATen/native/cuda/Blas.cpp +- aten/src/ATen/native/cuda/*Blas.cpp +- aten/src/ATen/cuda/CUDA*Blas.* - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py - third_party/fbgemm + +"ciflow/mps": +- aten/src/ATen/mps/** +- aten/src/ATen/native/mps/** +- torch/_inductor/codegen/mps.py +- test/test_mps.py +- test/inductor/test_mps_basic.py + +"ciflow/h100-symm-mem": +- torch/csrc/distributed/c10d/symm_mem/** +- torch/distributed/_symmetric_memory/** +- test/distributed/**/*mem* +- test/distributed/**/*mem*/** diff --git a/.github/nitpicks.yml b/.github/nitpicks.yml index 1d08a36abf1d5..e3fe5d4725587 100644 --- a/.github/nitpicks.yml +++ b/.github/nitpicks.yml @@ -10,3 +10,4 @@ pathFilter: - 'torch/csrc/inductor/aoti_torch/c/*' - 'torch/csrc/inductor/aoti_torch/generated/*' + - 'torch/csrc/stable/c/*' diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index c15ba606398f6..1258f4b8d8088 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -2,11 +2,12 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 ciflow_push_tags: - ciflow/b200 -- ciflow/b200-symm-mem - ciflow/b200-distributed +- ciflow/b200-symm-mem - ciflow/binaries - ciflow/binaries_libtorch - ciflow/binaries_wheel +- ciflow/dynamo - ciflow/h100 - ciflow/h100-cutlass-backend - ciflow/h100-distributed @@ -22,6 +23,8 @@ ciflow_push_tags: - ciflow/inductor-perf-test-nightly-xpu - ciflow/inductor-periodic - ciflow/inductor-rocm +- ciflow/inductor-rocm-mi200 +- ciflow/inductor-rocm-mi300 - ciflow/linux-aarch64 - ciflow/mps - ciflow/nightly @@ -33,11 +36,13 @@ ciflow_push_tags: - ciflow/quantization-periodic - ciflow/riscv64 - ciflow/rocm +- ciflow/rocm-mi200 - ciflow/rocm-mi300 - ciflow/rocm-mi355 - ciflow/rocm-navi31 - ciflow/s390 - ciflow/slow +- ciflow/slow-rocm-mi200 - ciflow/torchbench - ciflow/triton_binaries - ciflow/trunk diff --git a/.github/scripts/delete_old_branches.py b/.github/scripts/delete_old_branches.py index 8032008edf122..42cd851f8e338 100644 --- a/.github/scripts/delete_old_branches.py +++ b/.github/scripts/delete_old_branches.py @@ -1,10 +1,11 @@ # Delete old branches import os import re +from collections.abc import Callable from datetime import datetime from functools import lru_cache from pathlib import Path -from typing import Any, Callable +from 
typing import Any from github_utils import gh_fetch_json_dict, gh_graphql from gitutils import GitRepo diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 592c7aab6d933..ee102d3f560f9 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -8,10 +8,11 @@ import subprocess import sys import warnings +from collections.abc import Callable from enum import Enum from functools import cache from logging import info -from typing import Any, Callable, Optional +from typing import Any, Optional from urllib.request import Request, urlopen import yaml diff --git a/.github/scripts/generate_pytorch_version.py b/.github/scripts/generate_pytorch_version.py index b35ccf6bcd38a..85be79c762e28 100755 --- a/.github/scripts/generate_pytorch_version.py +++ b/.github/scripts/generate_pytorch_version.py @@ -50,7 +50,7 @@ def get_tag() -> str: def get_base_version() -> str: root = get_pytorch_root() - dirty_version = open(root / "version.txt").read().strip() + dirty_version = Path(root / "version.txt").read_text().strip() # Strips trailing a0 from version.txt, not too sure why it's there in the # first place return re.sub(LEGACY_BASE_VERSION_SUFFIX_PATTERN, "", dirty_version) diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py index b04cbed76e955..54e66621c9fd0 100644 --- a/.github/scripts/get_workflow_job_id.py +++ b/.github/scripts/get_workflow_job_id.py @@ -11,7 +11,8 @@ import time import urllib import urllib.parse -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any, Optional from urllib.request import Request, urlopen diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 110015988a5c3..6479fb64ddbaf 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -3,8 +3,9 @@ import json import os import warnings +from collections.abc import Callable from dataclasses import dataclass -from typing import Any, Callable, cast, Optional, Union +from typing import Any, cast, Optional, Union from urllib.error import HTTPError from urllib.parse import quote from urllib.request import Request, urlopen diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py index 3a90ddb5f4c6b..6e3bb3f209177 100644 --- a/.github/scripts/gitutils.py +++ b/.github/scripts/gitutils.py @@ -4,10 +4,10 @@ import re import tempfile from collections import defaultdict -from collections.abc import Iterator +from collections.abc import Callable, Iterator from datetime import datetime from functools import wraps -from typing import Any, Callable, cast, Optional, TypeVar, Union +from typing import Any, cast, Optional, TypeVar, Union T = TypeVar("T") diff --git a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh index b353617a45b2b..58cda19cfeb43 100755 --- a/.github/scripts/lintrunner.sh +++ b/.github/scripts/lintrunner.sh @@ -34,6 +34,9 @@ python3 torch/utils/data/datapipes/gen_pyi.py # Also check generated pyi files find torch -name '*.pyi' -exec git add --force -- "{}" + +# Print current environment +python3 -m pip freeze + RC=0 # Run lintrunner on all files if ! 
lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index c258284a00d83..697ab6992793d 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -17,12 +17,12 @@ import time import urllib.parse from collections import defaultdict -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from functools import cache from pathlib import Path from re import Pattern -from typing import Any, Callable, cast, NamedTuple, Optional +from typing import Any, cast, NamedTuple, Optional from warnings import warn import yaml diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 29c2fc8e08476..b52ec158dd6d6 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -326,7 +326,7 @@ jobs: SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }} SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }} SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} - DOCKER_IMAGE: ${{ inputs.docker-image }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 43ed76a63cc67..608aeba53e6d8 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -97,8 +97,8 @@ jobs: shell: bash run: | ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') - if [[ $ngpu -lt 4 ]]; then - echo "Error: only $ngpu GPU(s) detected, at least 4 GPUs are needed for distributed jobs" + if [[ $ngpu -lt 2 ]]; then # We are temporarily reducing this down to 2 from 4 so that we can run tests on nodes with fewer GPUs.
+ echo "Error: only $ngpu GPU(s) detected, at least 2 GPUs are needed for distributed jobs" exit 1 fi diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index e68bc6ead3a26..d27325b8a63dc 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -344,5 +344,21 @@ jobs: if-no-files-found: ignore path: ./**/core.[1-9]* + - name: Authenticate with AWS + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results + # The max duration enforced by the server side + role-duration-seconds: 18000 + aws-region: us-east-1 + + - name: Upload the benchmark results + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + with: + benchmark-results-dir: test/test-reports + dry-run: false + schema-version: v3 + github-token: ${{ secrets.GITHUB_TOKEN }} + - name: Teardown XPU uses: ./.github/actions/teardown-xpu diff --git a/.github/workflows/attention_op_microbenchmark.yml b/.github/workflows/attention_op_microbenchmark.yml new file mode 100644 index 0000000000000..e01bc49621dcf --- /dev/null +++ b/.github/workflows/attention_op_microbenchmark.yml @@ -0,0 +1,73 @@ +name: attention_op_microbenchmark + +on: + push: + tags: + - ciflow/op-benchmark/* + workflow_dispatch: + schedule: + # Run at 06:00 UTC everyday + - cron: 0 7 * * * + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + attn-microbenchmark-build: + if: github.repository_owner == 'pytorch' + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.0 9.0' + test-matrix: | + { include: [ + { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, + { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" }, + ]} + secrets: inherit + + attn-microbenchmark-test: + name: attn-microbenchmark-test + uses: ./.github/workflows/_linux-test.yml + needs: attn-microbenchmark-build + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }} + secrets: inherit + + # B200 runner + opmicrobenchmark-build-b200: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build-b200 + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, + ]} + secrets: inherit + + opmicrobenchmark-test-b200: + name: opmicrobenchmark-test-b200 + uses: ./.github/workflows/_linux-test.yml + needs: opmicrobenchmark-build-b200 + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 + docker-image: ${{ 
needs.opmicrobenchmark-build-b200.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 6fbe2e846d40b..408a8f0000504 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -56,6 +56,8 @@ jobs: pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11, pytorch-linux-jammy-py3.10-clang12, + pytorch-linux-jammy-py3.11-clang12, + pytorch-linux-jammy-py3.12-clang12, pytorch-linux-jammy-py3.13-clang12, pytorch-linux-jammy-py3.14-clang12, pytorch-linux-jammy-rocm-n-py3, @@ -65,9 +67,10 @@ jobs: pytorch-linux-jammy-py3.10-gcc11, pytorch-linux-jammy-py3-gcc11-inductor-benchmarks, pytorch-linux-jammy-py3.12-halide, + pytorch-linux-jammy-cuda12.8-py3.12-pallas, pytorch-linux-jammy-xpu-n-1-py3, - pytorch-linux-jammy-xpu-n-py3, - pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks, + pytorch-linux-noble-xpu-n-py3, + pytorch-linux-noble-xpu-n-py3-inductor-benchmarks, pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, @@ -77,9 +80,11 @@ jobs: pytorch-linux-noble-riscv64-py3.12-gcc14 ] include: - - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13 runner: linux.arm64.m7g.4xlarge - - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-clang21 + runner: linux.arm64.m7g.4xlarge + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks runner: linux.arm64.m7g.4xlarge timeout-minutes: 600 # Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358 @@ -114,6 +119,22 @@ jobs: with: docker-image: ${{ steps.build-docker-image.outputs.docker-image }} + - name: Generate output + if: contains(matrix.docker-image-name, 'rocm') + id: generate_output + run: | + docker_image_name="${{ matrix.docker-image-name }}" + docker_image_tag="${{ steps.build-docker-image.outputs.docker-image }}" + echo "${docker_image_name}=${docker_image_tag}" >> docker-builds-output-${docker_image_name}.txt + + - name: Upload artifacts + uses: actions/upload-artifact@v4.4.0 + if: contains(matrix.docker-image-name, 'rocm') + with: + name: docker-builds-artifacts-${{ matrix.docker-image-name }} + retention-days: 14 + path: ./docker-builds-output-${{ matrix.docker-image-name }}.txt + - uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 name: Push to https://ghcr.io/ id: push-to-ghcr-io diff --git a/.github/workflows/docker-cache-mi300.yml b/.github/workflows/docker-cache-mi300.yml deleted file mode 100644 index 02c1171c567aa..0000000000000 --- a/.github/workflows/docker-cache-mi300.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: docker-cache-mi300 - -on: - # run every 6 hours - schedule: - - cron: 0 0,6,12,18 * * * - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} - cancel-in-progress: true - -permissions: - id-token: write - contents: read - -jobs: - docker-cache: - if: github.repository_owner == 'pytorch' - runs-on: rocm-docker - steps: - - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main - with: - no-sudo: true - - - name: 
configure aws credentials - id: aws_creds - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - - name: Login to Amazon ECR - id: login-ecr - continue-on-error: false - uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 - - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 - push: false - - - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - - name: Tar and upload to S3 bucket - run: | - sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }} - sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress diff --git a/.github/workflows/docker-cache-rocm.yml b/.github/workflows/docker-cache-rocm.yml new file mode 100644 index 0000000000000..78d38de3ac69a --- /dev/null +++ b/.github/workflows/docker-cache-rocm.yml @@ -0,0 +1,105 @@ +name: docker-cache-rocm + +on: + workflow_run: + workflows: [docker-builds] + branches: [main, release] + types: + - completed + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + actions: read + +jobs: + download-docker-builds-artifacts: + if: github.repository_owner == 'pytorch' + name: download-docker-builds-artifacts + runs-on: ubuntu-latest + outputs: + pytorch-linux-jammy-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }} + pytorch-linux-noble-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }} + pytorch-linux-jammy-rocm-n-py3-benchmarks: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }} + steps: + - name: Download artifacts + uses: actions/download-artifact@v4.1.7 + with: + run-id: ${{ github.event.workflow_run.id }} + path: ./docker-builds-artifacts + merge-multiple: true + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Process artifacts + id: process-artifacts + run: | + ls -R ./docker-builds-artifacts + cat ./docker-builds-artifacts/*txt >> "${GITHUB_OUTPUT}" + cat "${GITHUB_OUTPUT}" + + docker-cache: + if: github.repository_owner == 'pytorch' + needs: download-docker-builds-artifacts + strategy: + fail-fast: false + matrix: + runner: [linux.rocm.gfx942.docker-cache] + docker-image: [ + "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}", + "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}", + "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}" + ] + runs-on: "${{ matrix.runner }}" + steps: + - name: debug + run: | + JSON_STRINGIFIED="${{ toJSON(needs.download-docker-builds-artifacts.outputs) }}" + echo "Outputs of download-docker-builds-artifacts job: ${JSON_STRINGIFIED}" + + - name: configure aws credentials + id: aws_creds + uses: 
aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: false + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Generate ghcr.io tag + id: ghcr-io-tag + run: | + ecr_image="${{ matrix.docker-image }}" + ghcr_image="ghcr.io/pytorch/ci-image:${ecr_image##*:}" + echo "ghcr_image=${ghcr_image}" >> "$GITHUB_OUTPUT" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.ghcr-io-tag.outputs.ghcr_image }} + + - name: Save as tarball + run: | + docker_image_tag=${{ matrix.docker-image }} + docker_image_tag="${docker_image_tag#*:}" # Remove everything before and including first ":" + docker_image_tag="${docker_image_tag%-*}" # Remove everything after and including last "-" + ref_name=${{ github.event.workflow_run.head_branch }} + if [[ $ref_name =~ "release/" ]]; then + ref_suffix="release" + elif [[ $ref_name == "main" ]]; then + ref_suffix="main" + else + echo "Unexpected branch in ref_name: ${ref_name}" && exit 1 + fi + docker tag ${{ steps.ghcr-io-tag.outputs.ghcr_image }} ${{ matrix.docker-image }} + # mv is an atomic operation, so we use an intermediate tar.tmp file to prevent read-write contention + docker save -o ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ${{ matrix.docker-image }} + mv ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ~/pytorch-data/docker/${docker_image_tag}_${ref_suffix}.tar diff --git a/.github/workflows/dynamo-unittest.yml b/.github/workflows/dynamo-unittest.yml new file mode 100644 index 0000000000000..e1399b1376de4 --- /dev/null +++ b/.github/workflows/dynamo-unittest.yml @@ -0,0 +1,70 @@ +# Workflow: Dynamo Unit Test +# runs unit tests for dynamo. 
+name: dynamo-unittest + +on: + push: + tags: + - ciflow/dynamo/* + workflow_call: + schedule: + - cron: 29 8 * * * # about 1:29am PDT + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + dynamo-build: + name: dynamo-build + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + strategy: + matrix: + python-version: ['3.11', '3.12'] + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py${{ matrix.python-version }}-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12 + test-matrix: | + { include: [ + { config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + ]} + secrets: inherit + + dynamo-test: + name: dynamo-test + uses: ./.github/workflows/_linux-test.yml + needs: [get-label-type, dynamo-build] + strategy: + matrix: + python-version: ['3.11', '3.12'] + with: + build-environment: linux-jammy-py${{ matrix.python-version }}-clang12 + docker-image: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12 + test-matrix: | + { include: [ + { config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + ]} + secrets: inherit diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml index be19b8f961f4d..c05b61e30a635 100644 --- a/.github/workflows/h100-distributed.yml +++ b/.github/workflows/h100-distributed.yml @@ -37,7 +37,6 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: "linux.c7i.12xlarge" build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: '9.0' diff --git a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index e16c8be79130d..46a1966570c63 100644 --- 
a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -72,7 +72,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner: linux.arm64.m7g.4xlarge build-environment: linux-jammy-aarch64-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_aarch64", shard: 1, num_shards: 9, runner: "linux.arm64.m7g.metal" }, diff --git a/.github/workflows/inductor-perf-test-nightly-xpu.yml b/.github/workflows/inductor-perf-test-nightly-xpu.yml index c2db8c310e368..28b10996bf38a 100644 --- a/.github/workflows/inductor-perf-test-nightly-xpu.yml +++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml @@ -83,8 +83,8 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-xpu-n-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks + build-environment: linux-noble-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3-inductor-benchmarks runner: linux.c7i.12xlarge test-matrix: | { include: [ @@ -117,7 +117,7 @@ jobs: uses: ./.github/workflows/_xpu-test.yml needs: xpu-n-py3_10-inductor-benchmark-build with: - build-environment: linux-jammy-xpu-n-py3.10 + build-environment: linux-noble-xpu-n-py3.10 dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }} test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }} @@ -137,7 +137,7 @@ jobs: uses: ./.github/workflows/_xpu-test.yml needs: xpu-n-py3_10-inductor-benchmark-build with: - build-environment: linux-jammy-xpu-n-py3.10 + build-environment: linux-noble-xpu-n-py3.10 dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }} test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }} diff --git a/.github/workflows/inductor-rocm.yml b/.github/workflows/inductor-rocm-mi200.yml similarity index 95% rename from .github/workflows/inductor-rocm.yml rename to .github/workflows/inductor-rocm-mi200.yml index b2ff53a645481..55de9a2121cf6 100644 --- a/.github/workflows/inductor-rocm.yml +++ b/.github/workflows/inductor-rocm-mi200.yml @@ -1,13 +1,13 @@ -name: inductor-rocm +name: inductor-rocm-mi200 on: schedule: - - cron: 0 * * * * + - cron: 0 */3 * * * push: branches: - release/* tags: - - ciflow/inductor-rocm/* + - ciflow/inductor-rocm-mi200/* workflow_dispatch: concurrency: diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index 732ec7eb85f3e..dee10a0db3c16 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -7,6 +7,7 @@ on: - release/* tags: - ciflow/inductor-rocm/* + - 
ciflow/inductor-rocm-mi300/* workflow_dispatch: concurrency: diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index 6ab276a57fc4d..ca9b57cab2ddb 100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -81,6 +81,32 @@ jobs: test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }} secrets: inherit + inductor-pallas-build: + name: inductor-pallas-build + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas + cuda-arch-list: '8.9' + runner: linux.8xlarge.memory + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + inductor-pallas-test: + name: inductor-pallas-test + uses: ./.github/workflows/_linux-test.yml + needs: inductor-pallas-build + with: + build-environment: linux-jammy-py3.12-gcc11 + docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }} + secrets: inherit + inductor-triton-cpu-build: name: inductor-triton-cpu-build uses: ./.github/workflows/_linux-build.yml @@ -115,10 +141,10 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ - { config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" }, - { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" }, + { config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.avx2" }, + { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.avx2" }, ]} secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 2616141c0dc2a..8a913c3b36a11 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -84,13 +84,13 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ - { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_timm", shard: 
2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, - { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, + { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" }, ]} build-additional-packages: "vision audio torchao" diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml index 2b840a39a5c21..e6690b1043006 100644 --- a/.github/workflows/linux-aarch64.yml +++ b/.github/workflows/linux-aarch64.yml @@ -33,7 +33,7 @@ jobs: with: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-aarch64-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13 runner: linux.arm64.m7g.4xlarge test-matrix: | { include: [ diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 0682dd2144afd..c47b0c5763078 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -5,9 +5,11 @@ on: - cron: 0 0 * * * push: tags: - # NOTE: Doc build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + # NOTE: Doc build pipelines should only get triggered on: + # Major or minor release candidate builds + - v[0-9]+.[0-9]+.0+-rc[0-9]+ + # Final RC for major, minor and patch releases + - v[0-9]+.[0-9]+.[0-9]+ - ciflow/nightly/* workflow_dispatch: diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index 40fb3b8d0c85f..758147f5fe18e 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -60,7 +60,7 @@ jobs: with: build-environment: linux-jammy-aarch64-py3.10 runner: linux.arm64.m7g.4xlarge - docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13 test-matrix: | { include: [ { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" }, diff --git a/.github/workflows/periodic-rocm-mi200.yml b/.github/workflows/periodic-rocm-mi200.yml index 
6b65bf05cbde0..18e7b60570bf8 100644 --- a/.github/workflows/periodic-rocm-mi200.yml +++ b/.github/workflows/periodic-rocm-mi200.yml @@ -11,7 +11,6 @@ on: - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests push: tags: - - ciflow/periodic/* - ciflow/periodic-rocm-mi200/* branches: - release/* diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml index 4d8890e69fc73..ce68ee8bc8e03 100644 --- a/.github/workflows/periodic-rocm-mi300.yml +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -11,6 +11,7 @@ on: - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests push: tags: + - ciflow/periodic/* - ciflow/periodic-rocm-mi300/* branches: - release/* diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index e3af55e736503..51e211a5ad2ad 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -70,6 +70,7 @@ jobs: { config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, + { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, ]} secrets: inherit @@ -342,16 +343,16 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-xpu-n-py3_10-build: - name: linux-jammy-xpu-n-py3.10 + linux-noble-xpu-n-py3_10-build: + name: linux-noble-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: # This should sync with the build in xpu.yml but xpu uses a larger runner # sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-n-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 + build-environment: linux-noble-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm-mi200.yml similarity index 97% rename from .github/workflows/rocm.yml rename to .github/workflows/rocm-mi200.yml index ffe6efbe0433c..c947e361bfcb5 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm-mi200.yml @@ -1,15 +1,16 @@ -name: rocm +name: rocm-mi200 on: push: branches: - release/* tags: - - ciflow/rocm/* + - ciflow/rocm-mi200/* workflow_dispatch: schedule: - cron: 29 8 * * * # about 1:29am PDT - - cron: 0 * * * * + - cron: 0 */3 * * * + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index c50111d068d24..d20b37be20876 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -6,6 +6,7 @@ on: - main - release/* tags: + - ciflow/rocm/* - ciflow/rocm-mi300/* workflow_dispatch: schedule: diff --git a/.github/workflows/slow-rocm-mi200.yml b/.github/workflows/slow-rocm-mi200.yml new file mode 100644 index 0000000000000..c564857dca9ce --- /dev/null 
+++ b/.github/workflows/slow-rocm-mi200.yml @@ -0,0 +1,81 @@ +# This workflow is dedicated to host slow jobs that are run only periodically because +# they are too slow to run in every commit. The list of slow tests can be found in +# https://github.com/pytorch/test-infra/blob/generated-stats/stats/slow-tests.json +name: slow-rocm-mi200 + +on: + push: + branches: + - release/* + tags: + - ciflow/slow/* + - ciflow/slow-rocm-mi200/* + schedule: + - cron: 0 */3 * * * + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + llm-td: + if: github.repository_owner == 'pytorch' + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, + { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index d4992a2ddb2cf..c14caee9a336c 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -105,36 +105,6 @@ jobs: test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit - linux-jammy-rocm-py3_10-build: - name: linux-jammy-rocm-py3.10 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-rocm-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 - test-matrix: | - { include: [ - { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, - { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", 
owners: ["module:rocm"] }, - ]} - secrets: inherit - - linux-jammy-rocm-py3_10-test: - permissions: - id-token: write - contents: read - name: linux-jammy-rocm-py3.10 - uses: ./.github/workflows/_rocm-test.yml - needs: - - linux-jammy-rocm-py3_10-build - - target-determination - with: - build-environment: linux-jammy-rocm-py3.10 - docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-py3_10-clang18-asan-build: name: linux-jammy-py3.10-clang18-asan uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/test-b200.yml b/.github/workflows/test-b200.yml index ef7f75bc4b2b4..07fd9b18fdada 100644 --- a/.github/workflows/test-b200.yml +++ b/.github/workflows/test-b200.yml @@ -5,7 +5,9 @@ # Flow: # 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200 # 2. Runs smoke tests on linux.dgx.b200 runner -# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function +# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke_b200() function +# - Includes matmul, scaled_matmul, FP8, and FlashAttention CuTe tests +# - FlashAttention CuTe DSL is installed as part of test execution # # Triggered by: # - Pull requests modifying this workflow file diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index ec99f4473bb0b..510473d5306ad 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -41,7 +41,6 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: linux.12xlarge.memory build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: '9.0' diff --git a/.github/workflows/trunk-rocm-mi300.yml b/.github/workflows/trunk-rocm-mi300.yml new file mode 100644 index 0000000000000..23ab5e9260a3e --- /dev/null +++ b/.github/workflows/trunk-rocm-mi300.yml @@ -0,0 +1,83 @@ +name: trunk-rocm-mi300 + +on: + push: + branches: + - main + - release/* + workflow_dispatch: + schedule: + - cron: 29 8 * * * # about 1:29am PDT + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + llm-td: + if: github.repository_owner == 'pytorch' + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" }, + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 6ba810c3a9582..667c37727045b 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -83,6 +83,7 @@ jobs: { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" }, + { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" }, ]} secrets: inherit diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml index 24c3ab3db84f3..b3d8073aad3b3 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -5,21 +5,23 @@ on: workflows: - pull - trunk + - trunk-rocm-mi300 - periodic - periodic-rocm-mi200 - periodic-rocm-mi300 - inductor - unstable - slow + - slow-rocm-mi200 - unstable-periodic - inductor-periodic - - rocm + - rocm-mi200 - rocm-mi300 - rocm-mi355 - inductor-micro-benchmark - inductor-micro-benchmark-x86 - inductor-cu124 - - inductor-rocm + - inductor-rocm-mi200 - inductor-rocm-mi300 - mac-mps - linux-aarch64 diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index 36f603f70fde7..d9a1ba13d2b59 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -47,15 +47,15 @@ jobs: ]} secrets: inherit - linux-jammy-xpu-n-py3_10-build: - name: linux-jammy-xpu-n-py3.10 + linux-noble-xpu-n-py3_10-build: + name: linux-noble-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-n-py3.10 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 + build-environment: linux-noble-xpu-n-py3.10 + docker-image-name: 
ci-image:pytorch-linux-noble-xpu-n-py3 runner: linux.c7i.12xlarge test-matrix: | { include: [ @@ -74,17 +74,17 @@ jobs: ]} secrets: inherit - linux-jammy-xpu-n-py3_10-test: - name: linux-jammy-xpu-n-py3.10 + linux-noble-xpu-n-py3_10-test: + name: linux-noble-xpu-n-py3.10 uses: ./.github/workflows/_xpu-test.yml - needs: linux-jammy-xpu-n-py3_10-build + needs: linux-noble-xpu-n-py3_10-build permissions: id-token: write contents: read with: - build-environment: linux-jammy-xpu-n-py3.10 - docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }} + build-environment: linux-noble-xpu-n-py3.10 + docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }} secrets: inherit windows-xpu-n-1-build: diff --git a/.lintrunner.toml b/.lintrunner.toml index cee0249ad96eb..7a6e241f90c8d 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -143,7 +143,8 @@ init_command = [ 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', - 'numpy==2.1.0 ; python_version >= "3.12"', + 'numpy==2.1.0 ; python_version >= "3.12" and python_version <= "3.13"', + 'numpy==2.3.4 ; python_version >= "3.14"', 'expecttest==0.3.0', 'pyrefly==0.36.2', 'sympy==1.13.3', @@ -185,6 +186,8 @@ include_patterns = [ 'aten/src/ATen/native/nested/cuda/*.h', 'aten/src/ATen/native/nested/*.cpp', 'aten/src/ATen/native/nested/*.h', + 'aten/src/ATen/xpu/**/*.h', + 'aten/src/ATen/xpu/**/*.cpp', 'c10/**/*.cpp', 'c10/**/*.h', 'torch/*.h', @@ -1401,7 +1404,7 @@ init_command = [ '--dry-run={{DRYRUN}}', 'usort==1.0.8.post1', 'isort==6.0.1', - 'ruff==0.13.1', # sync with RUFF + 'ruff==0.14.4', # sync with RUFF ] is_formatter = true @@ -1536,7 +1539,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'ruff==0.13.1', # sync with PYFMT + 'ruff==0.14.4', # sync with PYFMT ] is_formatter = true diff --git a/.spin/cmds.py b/.spin/cmds.py new file mode 100644 index 0000000000000..a81717c7423be --- /dev/null +++ b/.spin/cmds.py @@ -0,0 +1,330 @@ +import hashlib +import subprocess +import sys +from pathlib import Path + +import click +import spin + + +def file_digest(file, algorithm: str): + try: + return hashlib.file_digest(file, algorithm) + except AttributeError: + pass # Fallback to manual implementation below + hash = hashlib.new(algorithm) + while chunk := file.read(8192): + hash.update(chunk) + return hash + + +def _hash_file(file): + with open(file, "rb") as f: + hash = file_digest(f, "sha256") + return hash.hexdigest() + + +def _hash_files(files): + hashes = {file: _hash_file(file) for file in files} + return hashes + + +def _read_hashes(hash_file: Path): + if not hash_file.exists(): + return {} + with hash_file.open("r") as f: + lines = f.readlines() + hashes = {} + for line in lines: + hash = line[:64] + file = line[66:].strip() + hashes[file] = hash + return hashes + + +def _updated_hashes(hash_file, files_to_hash): + old_hashes = _read_hashes(hash_file) + new_hashes = _hash_files(files_to_hash) + if new_hashes != old_hashes: + return new_hashes + return None + + +@click.command() +def regenerate_version(): + """Regenerate version.py.""" + cmd = [ + sys.executable, + "-m", + "tools.generate_torch_version", + "--is-debug=false", + ] + spin.util.run(cmd) + + +TYPE_STUBS = [ + ( + "Pytorch type stubs", + 
Path(".lintbin/.pytorch-type-stubs.sha256"), + [ + "aten/src/ATen/native/native_functions.yaml", + "aten/src/ATen/native/tags.yaml", + "tools/autograd/deprecated.yaml", + ], + [ + sys.executable, + "-m", + "tools.pyi.gen_pyi", + "--native-functions-path", + "aten/src/ATen/native/native_functions.yaml", + "--tags-path", + "aten/src/ATen/native/tags.yaml", + "--deprecated-functions-path", + "tools/autograd/deprecated.yaml", + ], + ), + ( + "Datapipes type stubs", + None, + [], + [ + sys.executable, + "torch/utils/data/datapipes/gen_pyi.py", + ], + ), +] + + +@click.command() +def regenerate_type_stubs(): + """Regenerate type stubs.""" + for name, hash_file, files_to_hash, cmd in TYPE_STUBS: + if hash_file: + if hashes := _updated_hashes(hash_file, files_to_hash): + click.echo( + f"Changes detected in type stub files for {name}. Regenerating..." + ) + spin.util.run(cmd) + hash_file.parent.mkdir(parents=True, exist_ok=True) + with hash_file.open("w") as f: + for file, hash in hashes.items(): + f.write(f"{hash} {file}\n") + click.echo("Type stubs and hashes updated.") + else: + click.echo(f"No changes detected in type stub files for {name}.") + else: + click.echo(f"No hash file for {name}. Regenerating...") + spin.util.run(cmd) + click.echo("Type stubs regenerated.") + + +@click.command() +def regenerate_clangtidy_files(): + """Regenerate clang-tidy files.""" + cmd = [ + sys.executable, + "-m", + "tools.linter.clang_tidy.generate_build_files", + ] + spin.util.run(cmd) + + +#: These linters are expected to need less than 3s cpu time total +VERY_FAST_LINTERS = { + "ATEN_CPU_GPU_AGNOSTIC", + "BAZEL_LINTER", + "C10_NODISCARD", + "C10_UNUSED", + "CALL_ONCE", + "CMAKE_MINIMUM_REQUIRED", + "CONTEXT_DECORATOR", + "COPYRIGHT", + "CUBINCLUDE", + "DEPLOY_DETECTION", + "ERROR_PRONE_ISINSTANCE", + "EXEC", + "HEADER_ONLY_LINTER", + "IMPORT_LINTER", + "INCLUDE", + "LINTRUNNER_VERSION", + "MERGE_CONFLICTLESS_CSV", + "META_NO_CREATE_UNBACKED", + "NEWLINE", + "NOQA", + "NO_WORKFLOWS_ON_FORK", + "ONCE_FLAG", + "PYBIND11_INCLUDE", + "PYBIND11_SPECIALIZATION", + "PYPIDEP", + "PYPROJECT", + "RAWCUDA", + "RAWCUDADEVICE", + "ROOT_LOGGING", + "TABS", + "TESTOWNERS", + "TYPEIGNORE", + "TYPENOSKIP", + "WORKFLOWSYNC", +} + + +#: These linters are expected to take a few seconds, but less than 10s cpu time total +FAST_LINTERS = { + "CMAKE", + "DOCSTRING_LINTER", + "GHA", + "NATIVEFUNCTIONS", + "RUFF", + "SET_LINTER", + "SHELLCHECK", + "SPACES", +} + + +#: These linters are expected to take more than 10s cpu time total; +#: some need more than 1 hour. 
+SLOW_LINTERS = { + "ACTIONLINT", + "CLANGFORMAT", + "CLANGTIDY", + "CODESPELL", + "FLAKE8", + "GB_REGISTRY", + "PYFMT", + "PYREFLY", + "TEST_DEVICE_BIAS", + "TEST_HAS_MAIN", +} + + +ALL_LINTERS = VERY_FAST_LINTERS | FAST_LINTERS | SLOW_LINTERS + + +LINTRUNNER_CACHE_INFO = ( + Path(".lintbin/.lintrunner.sha256"), + [ + "requirements.txt", + "pyproject.toml", + ".lintrunner.toml", + ], +) + + +LINTRUNNER_BASE_CMD = [ + "uvx", + "--python", + "3.10", + "lintrunner@0.12.7", +] + + +@click.command() +def setup_lint(): + """Set up lintrunner with current CI version.""" + cmd = LINTRUNNER_BASE_CMD + ["init"] + subprocess.run(cmd, check=True, capture_output=True, text=True) + + +def _check_linters(): + cmd = LINTRUNNER_BASE_CMD + ["list"] + ret = spin.util.run(cmd, output=False, stderr=subprocess.PIPE) + linters = {l.strip() for l in ret.stdout.decode().strip().split("\n")[1:]} + unknown_linters = linters - ALL_LINTERS + missing_linters = ALL_LINTERS - linters + if unknown_linters: + click.secho( + f"Unknown linters found; please add them to the correct category " + f"in .spin/cmds.py: {', '.join(unknown_linters)}", + fg="yellow", + ) + if missing_linters: + click.secho( + f"Missing linters found; please update the corresponding category " + f"in .spin/cmds.py: {', '.join(missing_linters)}", + fg="yellow", + ) + return unknown_linters, missing_linters + + +@spin.util.extend_command( + setup_lint, + doc=f""" + If configuration has changed, update lintrunner. + + Compares the stored old hashes of configuration files with new ones and + performs setup via setup-lint if the hashes have changed. + Hashes are stored in {LINTRUNNER_CACHE_INFO[0]}; the following files are + considered: {", ".join(LINTRUNNER_CACHE_INFO[1])}. + """, +) +@click.pass_context +def lazy_setup_lint(ctx, parent_callback, **kwargs): + if hashes := _updated_hashes(*LINTRUNNER_CACHE_INFO): + click.echo( + "Changes detected in lint configuration files. Setting up linting tools..." + ) + parent_callback(**kwargs) + hash_file = LINTRUNNER_CACHE_INFO[0] + hash_file.parent.mkdir(parents=True, exist_ok=True) + with hash_file.open("w") as f: + for file, hash in hashes.items(): + f.write(f"{hash} {file}\n") + click.echo("Linting tools set up and hashes updated.") + else: + click.echo("No changes detected in lint configuration files. 
Skipping setup.") + click.echo("Regenerating version...") + ctx.invoke(regenerate_version) + click.echo("Regenerating type stubs...") + ctx.invoke(regenerate_type_stubs) + click.echo("Done.") + _check_linters() + + +@click.command() +@click.option("-a", "--apply-patches", is_flag=True) +@click.pass_context +def lint(ctx, apply_patches, **kwargs): + """Lint all files.""" + ctx.invoke(lazy_setup_lint) + all_files_linters = VERY_FAST_LINTERS | FAST_LINTERS + changed_files_linters = SLOW_LINTERS + cmd = LINTRUNNER_BASE_CMD + if apply_patches: + cmd += ["--apply-patches"] + all_files_cmd = cmd + [ + "--take", + ",".join(all_files_linters), + "--all-files", + ] + spin.util.run(all_files_cmd) + changed_files_cmd = cmd + [ + "--take", + ",".join(changed_files_linters), + ] + spin.util.run(changed_files_cmd) + + +@click.command() +@click.pass_context +def fixlint(ctx, **kwargs): + """Autofix all files.""" + ctx.invoke(lint, apply_patches=True) + + +@click.command() +@click.option("-a", "--apply-patches", is_flag=True) +@click.pass_context +def quicklint(ctx, apply_patches, **kwargs): + """Lint changed files.""" + ctx.invoke(lazy_setup_lint) + cmd = LINTRUNNER_BASE_CMD + if apply_patches: + cmd += ["--apply-patches"] + spin.util.run(cmd) + + +@click.command() +@click.pass_context +def quickfix(ctx, **kwargs): + """Autofix changed files.""" + ctx.invoke(quicklint, apply_patches=True) diff --git a/CMakeLists.txt b/CMakeLists.txt index ca1e4164be9b8..0e020abda3925 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,7 +234,17 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use Address+Undefined Sanitizers" OFF) option(USE_LSAN "Use Leak Sanitizer" OFF) option(USE_TSAN "Use Thread Sanitizer" OFF) + +# Track whether USE_CUDA was explicitly set by the user (before option() is called) +# If USE_CUDA is already defined in cache, it means user explicitly set it +if(DEFINED CACHE{USE_CUDA}) + set(_USE_CUDA_EXPLICITLY_SET TRUE) +else() + set(_USE_CUDA_EXPLICITLY_SET FALSE) +endif() + option(USE_CUDA "Use CUDA" ON) + option(USE_XPU "Use XPU" ON) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON @@ -726,6 +736,44 @@ if(NOT DEFINED USE_BLAS) set(USE_BLAS ON) endif() +# Prioritized Text Linker Optimization +if(USE_PRIORITIZED_TEXT_FOR_LD) + + set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") + set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") + + execute_process( + COMMAND ${Python_EXECUTABLE} + ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py + --filein "${LINKER_SCRIPT_FILE_IN}" + --fout "${LINKER_SCRIPT_FILE_OUT}" + RESULT_VARIABLE _gen_result + OUTPUT_VARIABLE _gen_output + ERROR_VARIABLE _gen_error + ) + + if(NOT _gen_result EQUAL 0) + message(FATAL_ERROR + "Failed to generate linker script:\n${_gen_output}\n${_gen_error}") + endif() + + append_cxx_flag_if_supported("-ffunction-sections" CMAKE_CXX_FLAGS) + append_cxx_flag_if_supported("-fdata-sections" CMAKE_CXX_FLAGS) + append_c_flag_if_supported("-ffunction-sections" CMAKE_C_FLAGS) + append_c_flag_if_supported("-fdata-sections" CMAKE_C_FLAGS) + + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}") + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}") + +else() + if(LINUX AND CPU_AARCH64) + message(WARNING [[ + It is strongly recommend to enable linker script optimization for all AArch64 Linux builds. 
+ To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 + ]]) + endif() +endif() + # Build libtorch mobile library, which contains ATen/TH ops and native support # for TorchScript model, but doesn't contain not-yet-unified caffe2 ops; if(INTERN_BUILD_MOBILE) @@ -1392,9 +1440,6 @@ if(BUILD_JNI) add_subdirectory(android/pytorch_android) endif() -include(cmake/Summary.cmake) -caffe2_print_configuration_summary() - # Parse custom debug info if(DEFINED USE_CUSTOM_DEBINFO) string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") @@ -1434,56 +1479,5 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() -if(USE_PRIORITIZED_TEXT_FOR_LD) - add_compile_options( - $<$:-ffunction-sections> - $<$:-fdata-sections> - ) - set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") - set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") - - add_custom_command( - OUTPUT "${LINKER_SCRIPT_FILE_OUT}" - COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}" - DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}" - COMMENT "Generating prioritized text linker files" - VERBATIM - ) - - add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}") - - if(BUILD_PYTHON) - set(LINKER_OPT_TARGETS torch_python) - endif() - - if(NOT BUILD_LIBTORCHLESS) - list(APPEND LINKER_OPT_TARGETS torch_cpu c10) - if(USE_CUDA) - list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda) - endif() - if(USE_XPU) - list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu) - endif() - if(USE_ROCM) - list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip) - endif() - endif() - - foreach(tgt IN LISTS LINKER_OPT_TARGETS) - if(TARGET ${tgt}) - add_dependencies("${tgt}" generate_linker_script) - target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}") - set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}") - else() - message(WARNING "Requested target '${tgt}' for linker script optimization was not found.") - endif() - endforeach() - -else() - if(LINUX AND CPU_AARCH64) - message(WARNING [[ - It is strongly recommend to enable linker script optimization for all AArch64 Linux builds. - To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 - ]]) - endif() -endif() +include(cmake/Summary.cmake) +caffe2_print_configuration_summary() diff --git a/CODEOWNERS b/CODEOWNERS index cc249dc4f43a2..137031066090e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -210,8 +210,12 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A /test/inductor/test_flex_attention.py @drisspg /test/inductor/test_flex_decoding.py @drisspg -# Low Precision GEMMs +# Low Precision & Grouped GEMMs /aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58 +/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58 +/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58 /aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58 /aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58 +/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58 +/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58 /test/test_scaled_matmul_cuda.py @drisspg @slayton58 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9df55ca6acd5c..bc0b0fc9bb00f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ aspects of contributing to PyTorch. 
- [Python Unit Testing](#python-unit-testing) - [Better local unit tests with `pytest`](#better-local-unit-tests-with-pytest) - [Local linting](#local-linting) - - [Running `mypy`](#running-mypy) + - [Running `pyrefly`](#running-pyrefly) - [C++ Unit Testing](#c-unit-testing) - [Run Specific CI Jobs](#run-specific-ci-jobs) - [Merging your Change](#merging-your-change) @@ -281,7 +281,7 @@ dependencies as well as the nightly binaries into the repo directory. **Prerequisites**: The following packages should be installed with `pip`: - `expecttest` and `hypothesis` - required to run tests -- `mypy` - recommended for linting +- `pyrefly` - recommended for type checking; see [Pyrefly](https://pyrefly.org/) - `pytest` - recommended to run tests more selectively Running ``` @@ -350,15 +350,32 @@ make lint Learn more about the linter on the [lintrunner wiki page](https://github.com/pytorch/pytorch/wiki/lintrunner) -#### Running `mypy` +#### Running `pyrefly` -`mypy` is an optional static type checker for Python. We have multiple `mypy` -configs for the PyTorch codebase that are automatically validated against whenever the linter is run. +[Pyrefly](https://pyrefly.org/) is a high-performance static type checker for Python. It provides fast type checking along with IDE features like autocomplete and instant error feedback. + +PyTorch uses Pyrefly for type checking across the codebase. The configuration is managed in `pyrefly.toml` at the root of the repository. + +**Getting Started with Pyrefly:** + +To run type checking on the PyTorch codebase: +```bash +pyrefly check +``` + +For more detailed error information with summaries: +```bash +pyrefly check --summarize-errors +``` + +**Learn More:** +- [Pyrefly Configuration](https://pyrefly.org/en/docs/configuration/) - Detailed configuration options +- [Pyrefly IDE Features](https://pyrefly.org/en/docs/IDE-features/) - Set up Pyrefly in your editor for real-time type checking +- [Python Typing Tutorial](https://pyrefly.org/en/docs/typing-for-python-developers/) - Learn about Python type annotations See [Guide for adding type annotations to PyTorch](https://github.com/pytorch/pytorch/wiki/Guide-for-adding-type-annotations-to-PyTorch) -for more information on how to set up `mypy` and tackle type annotation -tasks. +for PyTorch-specific guidance on how to set up `pyrefly` and tackle type annotation tasks in this codebase. ### C++ Unit Testing diff --git a/LICENSE b/LICENSE index 966a609b61e53..c23172f7aff02 100644 --- a/LICENSE +++ b/LICENSE @@ -37,7 +37,7 @@ Copyright (c) 2024 Tri Dao. All rights reserved. All contributions by Arm: -Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates +Copyright (c) 2021, 2023-2025 Arm Limited and/or its affiliates All contributions from Caffe: Copyright(c) 2013, 2014, 2015, the respective contributors diff --git a/SECURITY.md b/SECURITY.md index ed8228af36724..2d2c8a0c5f1c5 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,7 +1,7 @@ # Security Policy - [**Reporting a Vulnerability**](#reporting-a-vulnerability) - - [**Using Pytorch Securely**](#using-pytorch-securely) + - [**Using PyTorch Securely**](#using-pytorch-securely) - [Untrusted models](#untrusted-models) - [TorchScript models](#torchscript-models) - [Untrusted inputs](#untrusted-inputs) @@ -10,28 +10,30 @@ - [**CI/CD security principles**](#cicd-security-principles) ## Reporting Security Issues -Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch.
+Beware that none of the topics under [Using PyTorch Securely](#using-pytorch-securely) are considered vulnerabilities of PyTorch. However, if you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new -All reports submitted thru the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework. +All reports submitted through the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If an advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create a [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework. + +**Note on crashes and out of bounds access**: PyTorch is a computational framework that performs operations on behalf of the caller. Like many low-level libraries, PyTorch generally does not validate all inputs to every function—the responsibility for providing valid arguments lies with the calling code. While crashes and out of bounds memory access should be reported as bugs, they are generally not considered security vulnerabilities in PyTorch's threat model. Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: https://www.facebook.com/whitehat -## Using Pytorch Securely -**Pytorch models are programs**, so treat its security seriously -- running untrusted models is equivalent to running untrusted code. In general we recommend that model weights and the python code for the model are distributed independently. That said, be careful about where you get the python code from and who wrote it (preferentially check for a provenance or checksums, do not run any pip installed package). +## Using PyTorch Securely +**PyTorch models are programs**, so treat their security seriously -- running untrusted models is equivalent to running untrusted code. In general we recommend that model weights and the python code for the model are distributed independently. That said, be careful about where you get the python code from and who wrote it (preferentially check for a provenance or checksums, do not run any pip installed package). ### Untrusted models Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources[^data-poisoning-sources]. **Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions in [this page](https://developers.google.com/code-sandboxing). -**Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports.
[`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger surface of attack but is more flexible in what it can serialize. See the documentation for more details. +**Be mindful of risky model formats**. Give preference to sharing and loading weights with the appropriate format for your use case. [Safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger attack surface but is more flexible in what it can serialize. See the documentation for more details. Even for more secure serialization formats, unexpected inputs to the downstream system can cause diverse security threats (e.g. denial of service, out of bound reads/writes) and thus we recommend extensive validation of any untrusted inputs. @@ -43,7 +45,7 @@ Important Note: The trustworthiness of a model is not binary. You must always de ### TorchScript models -TorchScript models should treated the same way as locally executable code from an unknown source. Only run TorchScript models if you trust the provider. Please note, that tools for introspecting TorchScript models (such as `torch.utils.model_dump`) may also execute partial or full code stored in those models, therefore they should be used only if you trust the provider of the binary you are about to load. +TorchScript models should be treated the same way as locally executable code from an unknown source. Only run TorchScript models if you trust the provider. Please note that tools for introspecting TorchScript models (such as `torch.utils.model_dump`) may also execute partial or full code stored in those models, therefore they should be used only if you trust the provider of the binary you are about to load. ### Untrusted inputs during training and prediction @@ -59,9 +61,9 @@ If applicable, prepare your model against bad inputs and prompt injections. Some ### Data privacy -**Take special security measures if your model if you train models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and: -- Do not feed sensitive data to untrusted model (even if runs in a sandboxed environment) -- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if model overfits). +**Take special security measures if you train your models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and: +- Do not feed sensitive data to an untrusted model (even if it runs in a sandboxed environment) +- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if the model overfits). ### Using distributed features diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 8b283c417b74b..ae762e1def3ec 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI) if(USE_CUDA) # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build. # If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
- set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped).*") + set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped|f4f4bf16).*") file(GLOB_RECURSE fbgemm_genai_native_cuda_cu "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu" "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu") diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index a354b41912406..6bc321887502d 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -23,8 +23,6 @@ C10_DIAGNOSTIC_POP() #endif namespace at { -namespace { - /* These const variables defined the fp32 precisions for different backend We have "generic", "cuda", "mkldnn" backend now and we can choose fp32 @@ -41,16 +39,6 @@ namespace { ->rnn */ - C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){ - TORCH_WARN_ONCE( - "Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' " - "or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, " - "torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see " - "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices" - ); - } -} // namespace - Float32Backend str2backend(const std::string& name) { if (name == "generic") return Float32Backend::GENERIC; @@ -206,7 +194,6 @@ bool Context::allowTF32CuDNN(std::optional op) const { } else { return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32; } - warn_deprecated_fp32_precision_api(); return allow_tf32_cudnn; } @@ -214,7 +201,6 @@ void Context::setAllowTF32CuDNN(bool b) { setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE); setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE); allow_tf32_cudnn = b; - warn_deprecated_fp32_precision_api(); } void Context::setSDPPriorityOrder(const std::vector& order) { @@ -325,7 +311,6 @@ bool Context::allowTF32CuBLAS() const { "Current status indicate that you have used mix of the legacy and new APIs to set the TF32 status for cublas matmul. ", "We suggest only using the new API to set the TF32 flag. See also: ", "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"); - warn_deprecated_fp32_precision_api(); return allow_tf32_new; } @@ -349,7 +334,6 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const { "Current status indicate that you have used mix of the legacy and new APIs to set the matmul precision. ", "We suggest only using the new API for matmul precision. 
See also: ", "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"); - warn_deprecated_fp32_precision_api(); return float32_matmul_precision; } @@ -377,7 +361,6 @@ Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) void Context::setFloat32MatmulPrecision(const std::string &s) { auto match = [this](const std::string & s_) { - warn_deprecated_fp32_precision_api(); // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention if (s_ == "highest") { float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 6807e527eb75f..385ccb88c463b 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -174,6 +174,12 @@ class TORCH_API Context { static long versionCuDNN() { return detail::getCUDAHooks().versionCuDNN(); } + static long versionRuntimeCuDNN() { + return detail::getCUDAHooks().versionRuntimeCuDNN(); + } + static long versionCuDNNFrontend() { + return detail::getCUDAHooks().versionCuDNNFrontend(); + } static bool hasCuSOLVER() { return detail::getCUDAHooks().hasCuSOLVER(); } diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h index f23b35047fcc8..2cc4cff7cd1f2 100644 --- a/aten/src/ATen/DeviceAccelerator.h +++ b/aten/src/ATen/DeviceAccelerator.h @@ -94,6 +94,11 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) { at::getDeviceAllocator(device_type)->resetPeakStats(device_index); } +TORCH_API inline std::pair getMemoryInfo( + c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + return at::getDeviceAllocator(device_type)->getMemoryInfo(device_index); +} } // namespace at::accelerator namespace at { diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 40ad61cbd6455..870f7172d1622 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef __CUDACC__ #include // For CUDA_VERSION @@ -61,12 +62,9 @@ TORCH_API void record_kernel_function_dtype(std::string name); } \ } while (0) -#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \ - case enum_type: { \ - AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \ - using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT; \ - return __VA_ARGS__(); \ - } +#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \ + THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL( \ + AT_PRIVATE_CHECK_SELECTIVE_BUILD, enum_type, HINT, __VA_ARGS__) #define AT_DISPATCH_CASE(enum_type, ...) \ AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__) @@ -95,14 +93,6 @@ TORCH_API void record_kernel_function_dtype(std::string name); return __VA_ARGS__(); \ } -namespace detail { - -inline at::ScalarType scalar_type(at::ScalarType s) { - return s; -} - -} // namespace detail - // The AT_DISPATCH_* family of macros provides the ability to // conveniently generate specializations of a kernel over all of the // dtypes we care about in PyTorch. We call it "dispatch" because @@ -190,27 +180,13 @@ inline at::ScalarType scalar_type(at::ScalarType s) { // but we're just being safe (and it doesn't hurt.) Note we must // use it to shut up warnings about unused store. -#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) 
\ - [&] { \ - const auto& the_type = TYPE; \ - constexpr const char* at_dispatch_name = NAME; \ - /* don't use TYPE again in case it is an expensive or side-effect op */ \ - at::ScalarType _st = ::detail::scalar_type(the_type); \ - RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st); \ - C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") \ - switch (_st) { \ - __VA_ARGS__ \ - default: \ - TORCH_CHECK_NOT_IMPLEMENTED( \ - false, \ - '"', \ - at_dispatch_name, \ - "\" not implemented for '", \ - toString(_st), \ - "'"); \ - } \ - C10_DIAGNOSTIC_POP() \ - }() +#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \ + THO_DISPATCH_SWITCH_TMPL( \ + RECORD_KERNEL_FUNCTION_DTYPE, \ + TORCH_CHECK_NOT_IMPLEMENTED, \ + TYPE, \ + NAME, \ + __VA_ARGS__) #define AT_DISPATCH_CASE_FLOATING_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \ diff --git a/aten/src/ATen/Dispatch_v2.h b/aten/src/ATen/Dispatch_v2.h index d0b77220faef2..fbeb48d45e32a 100644 --- a/aten/src/ATen/Dispatch_v2.h +++ b/aten/src/ATen/Dispatch_v2.h @@ -1,3 +1,8 @@ +#pragma once + +#include + +// Get AT_DISPATCH_SWITCH and AT_DISPATCH_CASE: #include // This is a new implementation of the AT_DISPATCH macro family from @@ -74,41 +79,19 @@ // macro expansion occurs, mediated with AT_EXPAND and AT_GUARD. I mostly // relied on GPT4 to help me get it right. -// Public API macros - // See documentation above #define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__)) - -// This macro lets you pass an arbitrary expression that may contain internal -// commas to another macro without having the commas causing the expression -// to be interpreted as being multiple arguments -#define AT_WRAP(...) __VA_ARGS__ - -#define AT_FLOAT8_TYPES \ - c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \ - c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu - -#define AT_INTEGRAL_TYPES \ - c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort -#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat -#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64 -#define AT_INTEGRAL_TYPES_V2 \ - AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES) -#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat -#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32 -// NB: not *actually* all types -#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES) -#define AT_ALL_TYPES_AND_COMPLEX \ - AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES) - -// Helper macros - + THO_DISPATCH_V2_TMPL( \ + AT_DISPATCH_SWITCH, \ + AT_DISPATCH_CASE, \ + TYPE, \ + NAME, \ + AT_WRAP(BODY), \ + __VA_ARGS__) + +// Unused helper macros, kept for BC: #define AT_AP_VAR(N, T, ...) \ AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__)) -#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b) -#define AT_CONCAT_AUX(a, b) a##b -#define AT_EXPAND(X) X // Ensure we never have too many scalar types for the expansion here to // support. To bump this, you must regenerate the macros below. @@ -119,12 +102,6 @@ static_assert(static_cast(c10::ScalarType::NumOptions) < 60); num_args = 60 -nums = ', '.join(str(i) for i in reversed(range(num_args+1))) -args = ', '.join(f'_{i}' for i in range(1, num_args+1)) - -print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))') -print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) 
N') - for i in range(1, num_args+1): args = ', '.join(f'_{i}' for i in range(1, i+1)) cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)]) @@ -135,8 +112,6 @@ for i in range(1, num_args+1): // Begin generated code // clang-format off -#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) -#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, N, ...) N #define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N) #define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) #define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h index 798e3535af3fb..f051e7b1f6531 100644 --- a/aten/src/ATen/LegacyBatchedTensorImpl.h +++ b/aten/src/ATen/LegacyBatchedTensorImpl.h @@ -144,7 +144,7 @@ inline std::bitset createVmapLevelsBitset(BatchDimsRef bdims) { } inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) { - out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")"; + out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')'; return out; } diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index 1fa852686656f..8618a67259c9c 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -9,7 +9,7 @@ namespace indexing { const EllipsisIndexType Ellipsis = EllipsisIndexType(); std::ostream& operator<<(std::ostream& stream, const Slice& slice) { - stream << slice.start() << ":" << slice.stop() << ":" << slice.step(); + stream << slice.start() << ':' << slice.stop() << ':' << slice.step(); return stream; } @@ -31,12 +31,12 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index) } std::ostream& operator<<(std::ostream& stream, const std::vector& tensor_indices) { - stream << "("; + stream << '('; for (const auto i : c10::irange(tensor_indices.size())) { stream << tensor_indices[i]; if (i < tensor_indices.size() - 1) stream << ", "; } - stream << ")"; + stream << ')'; return stream; } diff --git a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index bff12aa8de65f..ac6857b95c1d6 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -113,7 +113,7 @@ void TensorNames::checkUnique(const char* op_name) const { std::ostream& operator<<(std::ostream& out, const TensorName& tensorname) { out << tensorname.name_ << " (index "; out << tensorname.origin_idx_ << " of "; - out << tensorname.origin_ << ")"; + out << tensorname.origin_ << ')'; return out; } diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 8236751679f06..2752ff792e485 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -13,9 +13,9 @@ std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) { if (t.pos == 0) { // 0 is distinguished; it usually indicates 'self' or the return // tensor - out << "'" << t.name << "'"; + out << '\'' << t.name << '\''; } else { - out << "argument 
#" << t.pos << " '" << t.name << "'"; + out << "argument #" << t.pos << " '" << t.name << '\''; } return out; } @@ -154,7 +154,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { oss << "Tensor for " << t2 << " is on CPU, "; } oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it") - << " to be on GPU (while checking arguments for " << c << ")"; + << " to be on GPU (while checking arguments for " << c << ')'; TORCH_CHECK(false, oss.str()); } TORCH_CHECK( @@ -199,7 +199,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t, i++; } oss << "; but got " << t->toString() - << " instead (while checking arguments for " << c << ")"; + << " instead (while checking arguments for " << c << ')'; TORCH_CHECK(false, oss.str()); } } diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 7239f357fdd64..a6335d9e11304 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -43,8 +43,8 @@ std::string get_mkldnn_version() { // https://github.com/intel/ideep/issues/29 { const dnnl_version_t* ver = dnnl_version(); - ss << "Intel(R) MKL-DNN v" << ver->major << "." << ver->minor << "." << ver->patch - << " (Git Hash " << ver->hash << ")"; + ss << "Intel(R) MKL-DNN v" << ver->major << '.' << ver->minor << '.' << ver->patch + << " (Git Hash " << ver->hash << ')'; } #else ss << "MKLDNN not found"; @@ -81,7 +81,7 @@ std::string get_openmp_version() { break; } if (ver_str) { - ss << " (a.k.a. OpenMP " << ver_str << ")"; + ss << " (a.k.a. OpenMP " << ver_str << ')'; } } #else @@ -135,38 +135,38 @@ std::string show_config() { #if defined(__GNUC__) { - ss << " - GCC " << __GNUC__ << "." << __GNUC_MINOR__ << "\n"; + ss << " - GCC " << __GNUC__ << '.' << __GNUC_MINOR__ << '\n'; } #endif #if defined(__cplusplus) { - ss << " - C++ Version: " << __cplusplus << "\n"; + ss << " - C++ Version: " << __cplusplus << '\n'; } #endif #if defined(__clang_major__) { - ss << " - clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__ << "\n"; + ss << " - clang " << __clang_major__ << '.' << __clang_minor__ << '.' << __clang_patchlevel__ << '\n'; } #endif #if defined(_MSC_VER) { - ss << " - MSVC " << _MSC_FULL_VER << "\n"; + ss << " - MSVC " << _MSC_FULL_VER << '\n'; } #endif #if AT_MKL_ENABLED() - ss << " - " << get_mkl_version() << "\n"; + ss << " - " << get_mkl_version() << '\n'; #endif #if AT_MKLDNN_ENABLED() - ss << " - " << get_mkldnn_version() << "\n"; + ss << " - " << get_mkldnn_version() << '\n'; #endif #ifdef _OPENMP - ss << " - " << get_openmp_version() << "\n"; + ss << " - " << get_openmp_version() << '\n'; #endif #if AT_BUILD_WITH_LAPACK() @@ -183,7 +183,7 @@ std::string show_config() { ss << " - Cross compiling on MacOSX\n"; #endif - ss << " - "<< used_cpu_capability() << "\n"; + ss << " - "<< used_cpu_capability() << '\n'; if (hasCUDA()) { ss << detail::getCUDAHooks().showConfig(); @@ -200,10 +200,10 @@ std::string show_config() { ss << " - Build settings: "; for (const auto& pair : caffe2::GetBuildOptions()) { if (!pair.second.empty()) { - ss << pair.first << "=" << pair.second << ", "; + ss << pair.first << '=' << pair.second << ", "; } } - ss << "\n"; + ss << '\n'; // TODO: do HIP // TODO: do XLA diff --git a/aten/src/ATen/code_template.h b/aten/src/ATen/code_template.h index 2026795fc0a3d..2cde802dac172 100644 --- a/aten/src/ATen/code_template.h +++ b/aten/src/ATen/code_template.h @@ -209,7 +209,7 @@ struct CodeTemplate { // to indent correctly in the context. 
void emitIndent(std::ostream& out, size_t indent) const { for ([[maybe_unused]] const auto i : c10::irange(indent)) { - out << " "; + out << ' '; } } void emitStringWithIndents( diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 603e7e73bc1ea..71af40c5fd20a 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -226,8 +226,8 @@ template < typename B = HostBlock> struct CachingHostAllocatorImpl { virtual ~CachingHostAllocatorImpl() { - active_ = false; - if (pinned_use_background_threads()) { + if (active_) { + active_ = false; getBackgroundThreadPool()->waitWorkComplete(); } } @@ -260,6 +260,7 @@ struct CachingHostAllocatorImpl { if (pinned_use_background_threads()) { // Launch the background thread and process events in a loop. static bool background_thread_flag [[maybe_unused]] = [this] { + active_ = true; getBackgroundThreadPool()->run([&]() { while (active_) { process_events(); @@ -683,9 +684,9 @@ struct CachingHostAllocatorImpl { alignas(hardware_destructive_interference_size) std::mutex events_mutex_; std::deque> events_; // event queue paired with block - // Indicates whether the object is active. + // Indicates whether the event-processing thread pool is active. // Set to false in the destructor to signal background threads to stop. - std::atomic active_{true}; + std::atomic active_{false}; protected: alignas(hardware_destructive_interference_size) HostStatsStaged stats_; }; diff --git a/aten/src/ATen/core/Dimname.cpp b/aten/src/ATen/core/Dimname.cpp index c78d554732b9e..66aa8cb69e1ed 100644 --- a/aten/src/ATen/core/Dimname.cpp +++ b/aten/src/ATen/core/Dimname.cpp @@ -10,7 +10,7 @@ std::ostream& operator<<(std::ostream& out, const Dimname& dimname) { if (dimname.type() == NameType::WILDCARD) { out << "None"; } else { - out << "'" << dimname.symbol().toUnqualString() << "'"; + out << '\'' << dimname.symbol().toUnqualString() << '\''; } return out; } diff --git a/aten/src/ATen/core/Range.cpp b/aten/src/ATen/core/Range.cpp index 06a79a9c7d063..b5f4c7b6f85bc 100644 --- a/aten/src/ATen/core/Range.cpp +++ b/aten/src/ATen/core/Range.cpp @@ -5,7 +5,7 @@ namespace at { std::ostream& operator<<(std::ostream& out, const Range& range) { - out << "Range[" << range.begin << ", " << range.end << "]"; + out << "Range[" << range.begin << ", " << range.end << ']'; return out; } diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index c5f887f096cd1..090e77e703736 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -71,7 +71,7 @@ void TensorBase::enforce_invariants() { void TensorBase::print() const { if (defined()) { - std::cerr << "[" << toString() << " " << sizes() << "]" << '\n'; + std::cerr << '[' << toString() << ' ' << sizes() << ']' << '\n'; } else { std::cerr << "[UndefinedTensor]" << '\n'; } diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index 8cf57d2b646fe..d6421bcced0a8 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -11,252 +12,37 @@ namespace at { -// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor -// is used to enable the __restrict__ keyword/modifier for the data -// passed to cuda. 
-template -struct DefaultPtrTraits { - typedef T* PtrType; -}; - +using torch::headeronly::DefaultPtrTraits; #if defined(__CUDACC__) || defined(__HIPCC__) -template -struct RestrictPtrTraits { - typedef T* __restrict__ PtrType; -}; + using torch::headeronly::RestrictPtrTraits; #endif -// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. -// For CUDA tensors it is used in device code (only). This means that we restrict ourselves -// to functions and types available there (e.g. IntArrayRef isn't). - -// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers. -template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> -class TensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - - C10_HOST_DEVICE TensorAccessorBase( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : data_(data_), sizes_(sizes_), strides_(strides_) {} - C10_HOST IntArrayRef sizes() const { - return IntArrayRef(sizes_,N); - } - C10_HOST IntArrayRef strides() const { - return IntArrayRef(strides_,N); - } - C10_HOST_DEVICE index_t stride(index_t i) const { - return strides_[i]; - } - C10_HOST_DEVICE index_t size(index_t i) const { - return sizes_[i]; - } - C10_HOST_DEVICE PtrType data() { - return data_; - } - C10_HOST_DEVICE const PtrType data() const { - return data_; - } -protected: - PtrType data_; - const index_t* sizes_; - const index_t* strides_; -}; - -// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using -// `Tensor.accessor()`. -// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only -// indexing on the device uses `TensorAccessor`s. template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> -class TensorAccessor : public TensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - - C10_HOST_DEVICE TensorAccessor( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : TensorAccessorBase(data_,sizes_,strides_) {} - - C10_HOST_DEVICE TensorAccessor operator[](index_t i) { - return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); - } - - C10_HOST_DEVICE const TensorAccessor operator[](index_t i) const { - return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); - } -}; - -template class PtrTraits, typename index_t> -class TensorAccessor : public TensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - - C10_HOST_DEVICE TensorAccessor( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : TensorAccessorBase(data_,sizes_,strides_) {} - C10_HOST_DEVICE T & operator[](index_t i) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - return this->data_[this->strides_[0]*i]; - } - C10_HOST_DEVICE const T & operator[](index_t i) const { - return this->data_[this->strides_[0]*i]; - } -}; +using TensorAccessorBase = torch::headeronly::detail::TensorAccessorBase; - -// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on for CUDA `Tensor`s on the host -// and as -// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host) -// in order to transfer them on the device when calling kernels. -// On the device, indexing of multidimensional tensors gives to `TensorAccessor`s. -// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__. 
-// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available -// on the device, so those functions are host only. template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> -class GenericPackedTensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - C10_HOST GenericPackedTensorAccessorBase( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : data_(data_) { - std::copy(sizes_, sizes_ + N, std::begin(this->sizes_)); - std::copy(strides_, strides_ + N, std::begin(this->strides_)); - } +using TensorAccessor = torch::headeronly::detail::TensorAccessor; - // if index_t is not int64_t, we want to have an int64_t constructor - template >> - C10_HOST GenericPackedTensorAccessorBase( - PtrType data_, - const source_index_t* sizes_, - const source_index_t* strides_) - : data_(data_) { - for (const auto i : c10::irange(N)) { - this->sizes_[i] = sizes_[i]; - this->strides_[i] = strides_[i]; - } - } +namespace detail { - C10_HOST_DEVICE index_t stride(index_t i) const { - return strides_[i]; - } - C10_HOST_DEVICE index_t size(index_t i) const { - return sizes_[i]; - } - C10_HOST_DEVICE PtrType data() { - return data_; - } - C10_HOST_DEVICE const PtrType data() const { - return data_; - } -protected: - PtrType data_; - // NOLINTNEXTLINE(*c-arrays*) - index_t sizes_[N]; - // NOLINTNEXTLINE(*c-arrays*) - index_t strides_[N]; - C10_HOST void bounds_check_(index_t i) const { - TORCH_CHECK_INDEX( +template +struct IndexBoundsCheck { + IndexBoundsCheck(index_t i) { + TORCH_CHECK_INDEX( 0 <= i && i < index_t{N}, "Index ", i, " is not within bounds of a tensor of dimension ", N); - } + } }; +} // namespace detail template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> -class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - - C10_HOST GenericPackedTensorAccessor( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} - - // if index_t is not int64_t, we want to have an int64_t constructor - template >> - C10_HOST GenericPackedTensorAccessor( - PtrType data_, - const source_index_t* sizes_, - const source_index_t* strides_) - : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} - - C10_DEVICE TensorAccessor operator[](index_t i) { - index_t* new_sizes = this->sizes_ + 1; - index_t* new_strides = this->strides_ + 1; - return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); - } - - C10_DEVICE const TensorAccessor operator[](index_t i) const { - const index_t* new_sizes = this->sizes_ + 1; - const index_t* new_strides = this->strides_ + 1; - return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); - } - - /// Returns a PackedTensorAccessor of the same dimension after transposing the - /// two dimensions given. Does not actually move elements; transposition is - /// made by permuting the size/stride arrays. If the dimensions are not valid, - /// asserts. 
- C10_HOST GenericPackedTensorAccessor transpose( - index_t dim1, - index_t dim2) const { - this->bounds_check_(dim1); - this->bounds_check_(dim2); - GenericPackedTensorAccessor result( - this->data_, this->sizes_, this->strides_); - std::swap(result.strides_[dim1], result.strides_[dim2]); - std::swap(result.sizes_[dim1], result.sizes_[dim2]); - return result; - } -}; - -template class PtrTraits, typename index_t> -class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase { -public: - typedef typename PtrTraits::PtrType PtrType; - C10_HOST GenericPackedTensorAccessor( - PtrType data_, - const index_t* sizes_, - const index_t* strides_) - : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} - - // if index_t is not int64_t, we want to have an int64_t constructor - template >> - C10_HOST GenericPackedTensorAccessor( - PtrType data_, - const source_index_t* sizes_, - const source_index_t* strides_) - : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} - - C10_DEVICE T & operator[](index_t i) { - return this->data_[this->strides_[0] * i]; - } - C10_DEVICE const T& operator[](index_t i) const { - return this->data_[this->strides_[0]*i]; - } - - // Same as in the general N-dimensional case, but note that in the - // 1-dimensional case the returned PackedTensorAccessor will always be an - // identical copy of the original - C10_HOST GenericPackedTensorAccessor transpose( - index_t dim1, - index_t dim2) const { - this->bounds_check_(dim1); - this->bounds_check_(dim2); - return GenericPackedTensorAccessor( - this->data_, this->sizes_, this->strides_); - } -}; +using GenericPackedTensorAccessorBase = torch::headeronly::detail::GenericPackedTensorAccessorBase, T, N, PtrTraits, index_t>; +template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> +using GenericPackedTensorAccessor = torch::headeronly::detail::GenericPackedTensorAccessor, detail::IndexBoundsCheck, T, N, PtrTraits, index_t>; // Can't put this directly into the macro function args because of commas #define AT_X GenericPackedTensorAccessor diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 2b9558197bdcb..2d7ca10433d6a 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -245,6 +245,9 @@ class TORCH_API TensorBase { size_t weak_use_count() const noexcept { return impl_.weak_use_count(); } + bool is_uniquely_owned() const noexcept { + return impl_.is_uniquely_owned(); + } std::string toString() const; diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp index 1cfc720aca52b..ac1ee45d58345 100644 --- a/aten/src/ATen/core/Vitals.cpp +++ b/aten/src/ATen/core/Vitals.cpp @@ -9,8 +9,8 @@ APIVitals VitalsAPI; std::ostream& operator<<(std::ostream& os, TorchVital const& tv) { for (const auto& m : tv.attrs) { - os << "[TORCH_VITAL] " << tv.name << "." << m.first << "\t\t " - << m.second.value << "\n"; + os << "[TORCH_VITAL] " << tv.name << '.' 
<< m.first << "\t\t " + << m.second.value << '\n'; } return os; } diff --git a/aten/src/ATen/core/alias_info.h b/aten/src/ATen/core/alias_info.h index bf0ff6ee72d3b..6a3335c328be2 100644 --- a/aten/src/ATen/core/alias_info.h +++ b/aten/src/ATen/core/alias_info.h @@ -100,18 +100,18 @@ inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) { // this does match the way things are represented in the schema inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) { - out << "("; + out << '('; bool first = true; for (const auto& set : aliasInfo.beforeSets()) { if (first) { first = false; } else { - out << "|"; + out << '|'; } out << set.toUnqualString(); } if (aliasInfo.isWrite()) { - out << "!"; + out << '!'; } if (aliasInfo.beforeSets() != aliasInfo.afterSets()) { out << " -> "; @@ -120,12 +120,12 @@ inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) { if (first) { first = false; } else { - out << "|"; + out << '|'; } out << set.toUnqualString(); } } - out << ")"; + out << ')'; return out; } } // namespace c10 diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h index 251da65e0896f..617d6a982ab4e 100644 --- a/aten/src/ATen/core/blob.h +++ b/aten/src/ATen/core/blob.h @@ -198,7 +198,7 @@ inline void swap(Blob& lhs, Blob& rhs) noexcept { } inline std::ostream& operator<<(std::ostream& out, const Blob& v) { - return out << "Blob[" << v.TypeName() << "]"; + return out << "Blob[" << v.TypeName() << ']'; } } // namespace caffe2 diff --git a/aten/src/ATen/core/class_type.cpp b/aten/src/ATen/core/class_type.cpp index 800d9ea0ef9f6..a65124e80979e 100644 --- a/aten/src/ATen/core/class_type.cpp +++ b/aten/src/ATen/core/class_type.cpp @@ -456,8 +456,8 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { *why_not << "Method on class '" << repr_str() << "' (1) is not compatible with interface '" << rhs.repr_str() << "' (2)\n" - << " (1) " << self_method->getSchema() << "\n" - << " (2) " << schema << "\n"; + << " (1) " << self_method->getSchema() << '\n' + << " (2) " << schema << '\n'; } return false; } diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index ea537400ef73d..f6f6bade9c90d 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -100,7 +100,7 @@ struct TORCH_API ClassType : public NamedType { std::string repr_str() const override { std::stringstream ss; ss << str() - << " (of Python compilation unit at: " << compilation_unit().get() << ")"; + << " (of Python compilation unit at: " << compilation_unit().get() << ')'; return ss.str(); } diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp index 9180d0d19e644..369bd374747ad 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp @@ -58,12 +58,12 @@ std::string DispatchKeyExtractor::dumpState() const { std::ostringstream oss; for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) { if (dispatch_arg_indices_reverse_.get(i)) { - oss << "1"; + oss << '1'; } else { - oss << "0"; + oss << '0'; } } - oss << " " << nonFallthroughKeys_ << "\n"; + oss << ' ' << nonFallthroughKeys_ << '\n'; return oss.str(); } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index afcaf51f231ae..5facca30a54f3 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -69,8 
+69,8 @@ class RegistrationListenerList final { void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) { auto nesting_value = dispatch_trace_nesting_value(); - for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " "; - std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl; + for (int64_t i = 0; i < nesting_value; ++i) std::cerr << ' '; + std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << ']' << std::endl; } } // namespace detail diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 928474ec3336d..e2627354971a0 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -570,7 +570,7 @@ void OperatorEntry::checkInvariants() const { std::string OperatorEntry::listAllDispatchKeys() const { std::ostringstream str; - str << "["; + str << '['; bool has_kernels = false; for (auto k : allDispatchKeysInFullSet()) { @@ -584,7 +584,7 @@ std::string OperatorEntry::listAllDispatchKeys() const { str << k; has_kernels = true; } - str << "]"; + str << ']'; return str.str(); } @@ -683,12 +683,12 @@ void OperatorEntry::setReportErrorCallback_(std::unique_ptr c // This WON'T report backend fallbacks. std::string OperatorEntry::dumpState() const { std::ostringstream oss; - oss << "name: " << name_ << "\n"; + oss << "name: " << name_ << '\n'; if (schema_) { - oss << "schema: " << schema_->schema << "\n"; - oss << "debug: " << schema_->debug << "\n"; + oss << "schema: " << schema_->schema << '\n'; + oss << "debug: " << schema_->debug << '\n'; oss << "alias analysis kind: " << toString(schema_->schema.aliasAnalysis()) - << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << "\n"; + << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << '\n'; } else { oss << "schema: (none)\n"; } diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index 6587af0f9ccc0..ffccbe282ddd2 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -7,7 +7,7 @@ namespace c10 { void FunctionSchema::dump() const { - std::cout << *this << "\n"; + std::cout << *this << '\n'; } const std::vector& FunctionSchema::getCorrectList(SchemaArgType type) const { @@ -210,9 +210,9 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { out << schema.name(); if (!schema.overload_name().empty()) { - out << "." << schema.overload_name(); + out << '.' 
<< schema.overload_name(); } - out << "("; + out << '('; bool seen_kwarg_only = false; for (const auto i : c10::irange(schema.arguments().size())) { @@ -273,7 +273,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { } if (need_paren) { - out << "("; + out << '('; } for (const auto i : c10::irange(returns.size())) { if (i > 0) { @@ -288,7 +288,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { out << "..."; } if (need_paren) { - out << ")"; + out << ')'; } return out; } @@ -471,7 +471,7 @@ bool FunctionSchema::isForwardCompatibleWith( if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) { if (why_not) { why_not - << "'" << arguments().at(i).name() << "'" + << '\'' << arguments().at(i).name() << '\'' << " is not forward compatible with the older version of the schema"; } return false; @@ -511,7 +511,7 @@ bool FunctionSchema::isForwardCompatibleWith( .isForwardCompatibleWith(old.arguments().at(i))) { if (why_not) { why_not << "Out argument '" - << "'" << arguments().at(i).name() + << '\'' << arguments().at(i).name() << " is not FC with the older version of the schema"; } return false; diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index c3e1520dc9868..f349567c26478 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -571,7 +571,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { if (arg.N()) { N = std::to_string(*arg.N()); } - out << "[" << N << "]"; + out << '[' << N << ']'; } else { out << unopt_type->str(); } @@ -582,15 +582,15 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { } if (is_opt) { - out << "?"; + out << '?'; } if (!arg.name().empty()) { - out << " " << arg.name(); + out << ' ' << arg.name(); } if (arg.default_value()) { - out << "="; + out << '='; if ((type->kind() == c10::TypeKind::StringType || unopt_type->kind() == c10::TypeKind::StringType) && arg.default_value().value().isString()) { diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 1ff8dd0410949..6e4ee82ab1137 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -66,7 +66,7 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) { } std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) { - out << v.qualifiedClassName() << "." << v.name(); + out << v.qualifiedClassName() << '.' 
<< v.name(); return out; } @@ -526,7 +526,7 @@ std::ostream& printMaybeAnnotatedList( !elementTypeCanBeInferredFromMembers(list_elem_type)) { out << "annotate(" << the_list.type()->annotation_str() << ", "; printList(out, the_list.toListRef(), "[", "]", formatter); - out << ")"; + out << ')'; return out; } else { return printList(out, the_list.toListRef(), "[", "]", formatter); @@ -538,7 +538,7 @@ std::ostream& printDict( std::ostream& out, const Dict& v, const IValueFormatter& formatter) { - out << "{"; + out << '{'; bool first = true; for (const auto& pair : v) { @@ -552,7 +552,7 @@ std::ostream& printDict( first = false; } - out << "}"; + out << '}'; return out; } } @@ -565,8 +565,8 @@ static std::ostream& printMaybeAnnotatedDict( auto value_type = the_dict.type()->castRaw()->getValueType(); if (the_dict.toGenericDict().empty() || !elementTypeCanBeInferredFromMembers(value_type)) { - out << "annotate(" << the_dict.type()->annotation_str() << ","; - printDict(out, the_dict.toGenericDict(), formatter) << ")"; + out << "annotate(" << the_dict.type()->annotation_str() << ','; + printDict(out, the_dict.toGenericDict(), formatter) << ')'; } else { return printDict(out, the_dict.toGenericDict(), formatter); } @@ -577,7 +577,7 @@ static std::ostream& printComplex(std::ostream & out, const IValue & v) { c10::complex d = v.toComplexDouble(); IValue real(d.real()), imag(std::abs(d.imag())); auto sign = d.imag() >= 0 ? '+' : '-'; - return out << real << sign << imag << "j"; + return out << real << sign << imag << 'j'; } std::ostream& IValue::repr( @@ -605,9 +605,9 @@ std::ostream& IValue::repr( if (static_cast(i) == d) { // -0.0 (signed zero) needs to be parsed as -0. if (i == 0 && std::signbit(d)) { - return out << "-" << i << "."; + return out << '-' << i << '.'; } - return out << i << "."; + return out << i << '.'; } } auto orig_prec = out.precision(); @@ -643,20 +643,20 @@ std::ostream& IValue::repr( device_stream << v.toDevice(); out << "torch.device("; c10::printQuotedString(out, device_stream.str()); - return out << ")"; + return out << ')'; } case IValue::Tag::Generator: { auto generator = v.toGenerator(); out << "torch.Generator(device="; c10::printQuotedString(out, generator.device().str()); - out << ", seed=" << generator.current_seed() << ")"; + out << ", seed=" << generator.current_seed() << ')'; return out; } case IValue::Tag::GenericDict: return printMaybeAnnotatedDict(out, v, formatter); case IValue::Tag::Enum: { auto enum_holder = v.toEnumHolder(); - return out << enum_holder->qualifiedClassName() << "." << + return out << enum_holder->qualifiedClassName() << '.' << enum_holder->name(); } case IValue::Tag::Object: { @@ -801,7 +801,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { if (c == FP_NORMAL || c == FP_ZERO) { int64_t i = static_cast(d); if (static_cast(i) == d) { - return out << i << "."; + return out << i << '.'; } } auto orig_prec = out.precision(); @@ -852,7 +852,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { return printDict(out, v.toGenericDict(), formatter); case IValue::Tag::PyObject: { auto py_obj = v.toPyObject(); - return out << ""; + return out << "'; } case IValue::Tag::Generator: return out << "Generator"; @@ -862,22 +862,22 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { // TODO we should attempt to call __str__ if the object defines it. 
auto obj = v.toObject(); // print this out the way python would do it - return out << "<" << obj->name() << " object at " << obj.get() << ">"; + return out << '<' << obj->name() << " object at " << obj.get() << '>'; } case IValue::Tag::Enum: { auto enum_holder = v.toEnumHolder(); - return out << "Enum<" << enum_holder->unqualifiedClassName() << "." << - enum_holder->name() << ">"; + return out << "Enum<" << enum_holder->unqualifiedClassName() << '.' << + enum_holder->name() << '>'; } } - return out << ""; + return out << " ivalue::Object::type() const { @@ -1050,7 +1050,7 @@ c10::intrusive_ptr ivalue::Object::deepcopy( std::stringstream err; err << "Cannot serialize custom bound C++ class"; if (auto qualname = type()->name()) { - err << " " << qualname->qualifiedName(); + err << ' ' << qualname->qualifiedName(); } err << ". Please define serialization methods via def_pickle() for " "this class."; diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index f13b0613691b4..73aed03da073d 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -18,6 +18,8 @@ #include #include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace torch { class TORCH_API CustomClassHolder : public c10::intrusive_ptr_target {}; namespace jit { @@ -1630,4 +1632,6 @@ struct TORCH_API WeakOrStrongTypePtr { } // namespace c10 +C10_DIAGNOSTIC_POP() + #include // IWYU pragma: keep diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 8d1c3aa83dadb..ac7540cffd18f 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -29,6 +29,8 @@ #include #include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace torch { namespace jit { struct Function; @@ -2567,3 +2569,5 @@ TypePtr IValue::type() const { } } // namespace c10 + +C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 666d1ade5789c..535831ea11d6e 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -211,7 +211,7 @@ struct TORCH_API OptionalType : public UnionType { std::string str() const override { std::stringstream ss; - ss << getElementType()->str() << "?"; + ss << getElementType()->str() << '?'; return ss.str(); } @@ -240,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType { std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Optional[" << getElementType()->annotation_str(printer) << "]"; + ss << "Optional[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -906,7 +906,7 @@ struct TORCH_API ListType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "List[" << getElementType()->annotation_str(printer) << "]"; + ss << "List[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -946,7 +946,7 @@ struct TORCH_API DictType : public SharedType { std::string str() const override { std::stringstream ss; ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str() - << ")"; + << ')'; return ss.str(); } @@ -1018,7 +1018,7 @@ struct TORCH_API FutureType std::string str() const override { std::stringstream ss; - ss << "Future(" << getElementType()->str() << ")"; + ss << "Future(" << getElementType()->str() << ')'; return ss.str(); } TypePtr createWithContained( @@ -1041,7 +1041,7 @@ struct TORCH_API FutureType std::string annotation_str_impl(const 
TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Future[" << getElementType()->annotation_str(printer) << "]"; + ss << "Future[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -1060,7 +1060,7 @@ struct TORCH_API AwaitType std::string str() const override { std::stringstream ss; - ss << "Await(" << getElementType()->str() << ")"; + ss << "Await(" << getElementType()->str() << ')'; return ss.str(); } TypePtr createWithContained( @@ -1083,7 +1083,7 @@ struct TORCH_API AwaitType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Await[" << getElementType()->annotation_str(printer) << "]"; + ss << "Await[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -1102,7 +1102,7 @@ struct TORCH_API RRefType std::string str() const override { std::stringstream ss; - ss << "RRef(" << getElementType()->str() << ")"; + ss << "RRef(" << getElementType()->str() << ')'; return ss.str(); } TypePtr createWithContained( @@ -1115,7 +1115,7 @@ struct TORCH_API RRefType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "RRef[" << getElementType()->annotation_str(printer) << "]"; + ss << "RRef[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; diff --git a/aten/src/ATen/core/operator_name.cpp b/aten/src/ATen/core/operator_name.cpp index 43a1fd24749a7..e55a84a4d305a 100644 --- a/aten/src/ATen/core/operator_name.cpp +++ b/aten/src/ATen/core/operator_name.cpp @@ -11,7 +11,7 @@ std::string toString(const OperatorName& opName) { std::ostream& operator<<(std::ostream& os, const OperatorName& opName) { os << opName.name; if (!opName.overload_name.empty()) { - os << "." << opName.overload_name; + os << '.' 
<< opName.overload_name; } return os; } diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index 9d8080cb8f317..d428aceb3d04c 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -65,7 +65,7 @@ VaryingShape VaryingShape::merge(const VaryingShape& other) const { template std::ostream& operator<<(std::ostream& out, const VaryingShape& vs) { - out << "("; + out << '('; if (!vs.size()) { out << "*)"; return out; @@ -79,10 +79,10 @@ std::ostream& operator<<(std::ostream& out, const VaryingShape& vs) { if (v.has_value()) { out << v.value(); } else { - out << "*"; + out << '*'; } } - out << ")"; + out << ')'; return out; } @@ -105,7 +105,7 @@ std::ostream& operator<<( } auto sizes_opt = ss.sizes(); - os << "("; + os << '('; for (size_t i = 0; i < rank_opt.value(); i++) { if (i > 0) { os << ", "; @@ -113,10 +113,10 @@ std::ostream& operator<<( if(sizes_opt.has_value() && sizes_opt.value()[i].is_static()) { os << sizes_opt.value()[i]; } else { - os << "*"; + os << '*'; } } - os << ")"; + os << ')'; return os; } @@ -131,17 +131,17 @@ std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s) { } std::ostream& operator<<(std::ostream& os, const Stride& s) { - os << "{"; + os << '{'; if (s.stride_index_.has_value()) { os << *s.stride_index_; } else { - os << "*"; + os << '*'; } - os << ":"; + os << ':'; if (s.stride_.has_value()) { os << *s.stride_; } else { - os << "*"; + os << '*'; } os << '}'; return os; diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index abba4e14583a3..46dc550b1f37b 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -67,7 +67,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { bool has_valid_strides_info = ndim > 0 && value->strides().isComplete() && value->strides().size() == ndim; - out << "("; + out << '('; size_t i = 0; bool symbolic = type_verbosity() == TypeVerbosity::Symbolic; for (i = 0; i < *ndim; ++i) { @@ -79,7 +79,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { } else if (symbolic) { out << value->symbolic_sizes().at(i); } else { - out << "*"; + out << '*'; } } if (has_valid_strides_info && @@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { } out << value->strides()[i].value(); } - out << "]"; + out << ']'; } if (type_verbosity() >= TypeVerbosity::Full) { if (value->requiresGrad()) { @@ -107,12 +107,12 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "device=" << *value->device(); } } - out << ")"; + out << ')'; } else { if (type_verbosity() >= TypeVerbosity::Full) { size_t i = 0; if (value->requiresGrad()) { - out << "(" + out << '(' << "requires_grad=" << *value->requiresGrad(); i++; } @@ -120,7 +120,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << ((i++ > 0) ? 
", " : "(") << "device=" << *value->device(); } if (i > 0) { - out << ")"; + out << ')'; } } } @@ -133,18 +133,18 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << *prim << "[]"; } else if (t.kind() == TypeKind::OptionalType) { auto prim = t.castRaw()->getElementType(); - out << *prim << "?"; + out << *prim << '?'; } else if(t.kind() == TypeKind::FutureType) { auto elem = t.castRaw()->getElementType(); - out << "Future[" << *elem << "]"; + out << "Future[" << *elem << ']'; } else if(t.kind() == TypeKind::RRefType) { auto elem = t.castRaw()->getElementType(); - out << "RRef[" << *elem << "]"; + out << "RRef[" << *elem << ']'; } else if(auto tup = t.cast()) { if (tup->schema()) { out << "NamedTuple"; } - out << "("; + out << '('; for(size_t i = 0; i < tup->elements().size(); ++i) { if(i > 0) out << ", "; @@ -160,7 +160,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << *(tup->elements()[i]); } } - out << ")"; + out << ')'; } else if (t.kind() == TypeKind::FunctionType) { out << "Function"; } else { @@ -475,7 +475,7 @@ std::optional unifyTypeList( why_not << "Could not unify type list since element " << i << " of type " << elements.at(i)->repr_str() << " did not match the types before it (" - << ret_type->repr_str() << ")"; + << ret_type->repr_str() << ')'; return std::nullopt; } ret_type = *maybe_unified; @@ -907,13 +907,13 @@ std::string TupleType::str() const { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) ss << name()->qualifiedName(); } else { - ss << "("; + ss << '('; for(size_t i = 0; i < elements().size(); ++i) { if(i > 0) ss << ", "; ss << elements()[i]->str(); } - ss << ")"; + ss << ')'; } return ss.str(); } @@ -1003,8 +1003,8 @@ bool InterfaceType::isSubTypeImpl( *why_not << "Method on interface '" << lhs.repr_str() << "' (1) is not compatible with interface '" << rhs.repr_str() << "' (2)\n" - << " (1) " << *self_schema << "\n" - << " (2) " << schema << "\n"; + << " (1) " << *self_schema << '\n' + << " (2) " << schema << '\n'; return false; } return false; @@ -1078,7 +1078,7 @@ SymbolicShape SymbolicShape::merge(const SymbolicShape& other) const { } void SymbolicShape::dump() const { - std::cout << *this << "\n"; + std::cout << *this << '\n'; } bool EnumType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { diff --git a/aten/src/ATen/core/union_type.cpp b/aten/src/ATen/core/union_type.cpp index dc4cb78872182..8731c2cbc4952 100644 --- a/aten/src/ATen/core/union_type.cpp +++ b/aten/src/ATen/core/union_type.cpp @@ -205,9 +205,9 @@ UnionType::UnionType(std::vector reference, TypeKind kind) : SharedType for (const auto i : c10::irange(reference.size())) { msg << reference[i]->repr_str(); if (i > 0) { - msg << ","; + msg << ','; } - msg << " "; + msg << ' '; } msg << "} has the single type " << types_[0]->repr_str() << ". 
Use the common supertype instead of creating a Union" diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h index 9e0b189bdac89..757ef839f965a 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -191,7 +191,7 @@ class Vectorized { auto vals = svreinterpret_u16_bf16(values); vals = sveor_u16_x(ptrue, vals, mask); return svreinterpret_bf16_u16(vals); - }; + } Vectorized round() const; Vectorized tan() const; Vectorized tanh() const; @@ -349,47 +349,47 @@ Vectorized inline Vectorized::frac() const { return convert_float_bfloat16(v1, v2); \ } -DEFINE_BF16_FUNC_VIA_FLOAT(isnan); -DEFINE_BF16_FUNC_VIA_FLOAT(angle); -DEFINE_BF16_FUNC_VIA_FLOAT(acos); -DEFINE_BF16_FUNC_VIA_FLOAT(acosh); -DEFINE_BF16_FUNC_VIA_FLOAT(asin); -DEFINE_BF16_FUNC_VIA_FLOAT(atan); -DEFINE_BF16_FUNC_VIA_FLOAT(atanh); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign); -DEFINE_BF16_FUNC_VIA_FLOAT(erf); -DEFINE_BF16_FUNC_VIA_FLOAT(erfc); -DEFINE_BF16_FUNC_VIA_FLOAT(exp); -DEFINE_BF16_FUNC_VIA_FLOAT(exp2); -DEFINE_BF16_FUNC_VIA_FLOAT(expm1); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot); -DEFINE_BF16_FUNC_VIA_FLOAT(i0); -DEFINE_BF16_FUNC_VIA_FLOAT(i0e); -DEFINE_BF16_FUNC_VIA_FLOAT(digamma); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter); -DEFINE_BF16_FUNC_VIA_FLOAT(log); -DEFINE_BF16_FUNC_VIA_FLOAT(log2); -DEFINE_BF16_FUNC_VIA_FLOAT(log10); -DEFINE_BF16_FUNC_VIA_FLOAT(log1p); -DEFINE_BF16_FUNC_VIA_FLOAT(sin); -DEFINE_BF16_FUNC_VIA_FLOAT(sinh); -DEFINE_BF16_FUNC_VIA_FLOAT(cos); -DEFINE_BF16_FUNC_VIA_FLOAT(cosh); -DEFINE_BF16_FUNC_VIA_FLOAT(ceil); -DEFINE_BF16_FUNC_VIA_FLOAT(floor); -DEFINE_BF16_FUNC_VIA_FLOAT(round); -DEFINE_BF16_FUNC_VIA_FLOAT(tan); -DEFINE_BF16_FUNC_VIA_FLOAT(tanh); -DEFINE_BF16_FUNC_VIA_FLOAT(trunc); -DEFINE_BF16_FUNC_VIA_FLOAT(lgamma); -DEFINE_BF16_FUNC_VIA_FLOAT(sqrt); -DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal); -DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt); -DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow); +DEFINE_BF16_FUNC_VIA_FLOAT(isnan) +DEFINE_BF16_FUNC_VIA_FLOAT(angle) +DEFINE_BF16_FUNC_VIA_FLOAT(acos) +DEFINE_BF16_FUNC_VIA_FLOAT(acosh) +DEFINE_BF16_FUNC_VIA_FLOAT(asin) +DEFINE_BF16_FUNC_VIA_FLOAT(atan) +DEFINE_BF16_FUNC_VIA_FLOAT(atanh) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign) +DEFINE_BF16_FUNC_VIA_FLOAT(erf) +DEFINE_BF16_FUNC_VIA_FLOAT(erfc) +DEFINE_BF16_FUNC_VIA_FLOAT(exp) +DEFINE_BF16_FUNC_VIA_FLOAT(exp2) +DEFINE_BF16_FUNC_VIA_FLOAT(expm1) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot) +DEFINE_BF16_FUNC_VIA_FLOAT(i0) +DEFINE_BF16_FUNC_VIA_FLOAT(i0e) +DEFINE_BF16_FUNC_VIA_FLOAT(digamma) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter) +DEFINE_BF16_FUNC_VIA_FLOAT(log) +DEFINE_BF16_FUNC_VIA_FLOAT(log2) +DEFINE_BF16_FUNC_VIA_FLOAT(log10) +DEFINE_BF16_FUNC_VIA_FLOAT(log1p) +DEFINE_BF16_FUNC_VIA_FLOAT(sin) +DEFINE_BF16_FUNC_VIA_FLOAT(sinh) +DEFINE_BF16_FUNC_VIA_FLOAT(cos) +DEFINE_BF16_FUNC_VIA_FLOAT(cosh) +DEFINE_BF16_FUNC_VIA_FLOAT(ceil) +DEFINE_BF16_FUNC_VIA_FLOAT(floor) +DEFINE_BF16_FUNC_VIA_FLOAT(round) +DEFINE_BF16_FUNC_VIA_FLOAT(tan) +DEFINE_BF16_FUNC_VIA_FLOAT(tanh) +DEFINE_BF16_FUNC_VIA_FLOAT(trunc) +DEFINE_BF16_FUNC_VIA_FLOAT(lgamma) +DEFINE_BF16_FUNC_VIA_FLOAT(sqrt) +DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal) 
+DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt) +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow) Vectorized inline Vectorized::operator==( const Vectorized& other) const { diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h index e968389987fc5..060d60fa3e2d8 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h @@ -223,6 +223,62 @@ CONVERT_FROM_BF16_TEMPLATE(double) CONVERT_FROM_BF16_TEMPLATE(float16_t) #endif +#ifdef __ARM_FEATURE_BF16 + +// clang-[17, 20] crashes when autovectorizing static cast to bf16 +// Below is a workaround to have some vectorization +// Works decently well for smaller int types +template +inline void convertToBf16Impl( + const from_type* __restrict src, + c10::BFloat16* __restrict dst, + uint64_t n) { + bfloat16_t* dstPtr = reinterpret_cast(dst); + uint64_t loopBound = n - (n % 16); + uint64_t i = 0; + for (; i < loopBound; i += 16) { + float32x4_t a, b, c, d; + a[0] = static_cast(src[i]); + a[1] = static_cast(src[i + 1]); + a[2] = static_cast(src[i + 2]); + a[3] = static_cast(src[i + 3]); + b[0] = static_cast(src[i + 4]); + b[1] = static_cast(src[i + 5]); + b[2] = static_cast(src[i + 6]); + b[3] = static_cast(src[i + 7]); + c[0] = static_cast(src[i + 8]); + c[1] = static_cast(src[i + 9]); + c[2] = static_cast(src[i + 10]); + c[3] = static_cast(src[i + 11]); + d[0] = static_cast(src[i + 12]); + d[1] = static_cast(src[i + 13]); + d[2] = static_cast(src[i + 14]); + d[3] = static_cast(src[i + 15]); + + vst1q_bf16(dstPtr + i, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(a), b)); + vst1q_bf16(dstPtr + i + 8, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(c), d)); + } + +#pragma clang loop vectorize(disable) interleave(disable) unroll(disable) + for (; i < n; i++) { + float a = static_cast(src[i]); + dstPtr[i] = vcvth_bf16_f32(a); + } +} + +#define CONVERT_TO_BF16_TEMPLATE(from_type) \ + template <> \ + inline void convert(const from_type* src, c10::BFloat16* dst, int64_t n) { \ + return convertToBf16Impl(src, dst, n); \ + } + +CONVERT_TO_BF16_TEMPLATE(uint8_t) +CONVERT_TO_BF16_TEMPLATE(int8_t) +CONVERT_TO_BF16_TEMPLATE(int16_t) +CONVERT_TO_BF16_TEMPLATE(int32_t) + +#endif + inline void convertBoolToBfloat16Impl( const bool* __restrict src, c10::BFloat16* __restrict dst, diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h index c479fc2e4aeb2..6a64226475cf3 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h @@ -11,6 +11,8 @@ #include #endif +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + // Sleef offers vectorized versions of some transcedentals // such as sin, cos, tan etc.. 
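The two bfloat16 hunks above share one theme: the SVE DEFINE_BF16_FUNC_VIA_FLOAT helpers implement bf16 math by widening to float32, computing there, and narrowing back, and the NEON int-to-bf16 conversion works around a clang 17-20 auto-vectorization crash by doing that float32 round trip by hand. A standalone scalar sketch of the widen/compute/narrow pattern, with hypothetical helper names and NaN handling omitted:

#include <cmath>
#include <cstdint>
#include <cstring>

// bf16 is the top 16 bits of an IEEE-754 float32.
inline float bf16_to_float(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

inline uint16_t float_to_bf16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  bits += 0x7FFFu + ((bits >> 16) & 1u);  // round to nearest, ties to even
  return static_cast<uint16_t>(bits >> 16);
}

// "via float" pattern: widen, run the float routine, narrow back.
inline uint16_t bf16_sin(uint16_t x) {
  return float_to_bf16(std::sin(bf16_to_float(x)));
}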
// However for now opting for STL, since we are not building @@ -650,3 +652,5 @@ inline Vectorized Vectorized::erf() const { } // namespace CPU_CAPABILITY } // namespace at::vec + +C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h index 50c3cc31a6c48..a2eb9e5f45104 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256.h @@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } stream << buf[i]; } - stream << "]"; + stream << ']'; return stream; } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h index 975b71ce9a867..623971454df8b 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512.h @@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } stream << buf[i]; } - stream << "]"; + stream << ']'; return stream; } diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index aaed431064611..9a55b058001da 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -388,6 +388,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D #ifndef USE_ROCM at::Half halpha; at::Half hbeta; + uint32_t mask = -1; #endif void * alpha_ptr = α void * beta_ptr = β @@ -427,7 +428,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); if (fp16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { - uint32_t mask = + mask = fp16_reduction == at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | @@ -444,7 +445,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); if (bf16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { - uint32_t mask = + mask = bf16_reduction == at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK ? 
(CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | @@ -511,17 +512,41 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS; cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; - TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( - ltHandle, - computeDesc.descriptor(), - Adesc.descriptor(), - Bdesc.descriptor(), - Cdesc.descriptor(), - Cdesc.descriptor(), - preference.descriptor(), - 1, - &heuristicResult, - &returnedResult)); + // on Blackwell+, we fake a n > 1 matmul when querying heuristics + // to prevent cuBLASLt from dispatching to a GEMV kernel for batch-invariance +#ifndef USE_ROCM + const bool lie_to_cublaslt = mask == CUBLASLT_REDUCTION_SCHEME_NONE && n == 1 && at::cuda::getCurrentDeviceProperties()->major >= 10; +#else + const bool lie_to_cublaslt = false; +#endif + if (lie_to_cublaslt) { + CuBlasLtMatrixLayout FakeBdesc(abType, k, 2, ldb, opb == CUBLAS_OP_T); + CuBlasLtMatrixLayout FakeCdesc(cType, m, 2, ldc); + + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + FakeBdesc.descriptor(), + FakeCdesc.descriptor(), + FakeCdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + } else { + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + } if (returnedResult == 0) { cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED; } @@ -1572,7 +1597,7 @@ bool gemm_and_bias( } using opmath_t = at::opmath_type; - opmath_t beta_val = 0; // bias is added in epilogue + opmath_t beta_val = bias ? 0 : 1; // bias is added in epilogue unless nullptr cudaDataType_t abType = CUDA_R_32F; cudaDataType_t cType = CUDA_R_32F; @@ -1661,15 +1686,22 @@ bool gemm_and_bias( _syncCurrentWithCarveoutStream(stream, true); } #endif - cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; - if (activation == GEMMAndBiasActivationEpilogue::RELU) { - epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; - } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { - epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; - } + const auto epilogue = [&]() -> cublasLtEpilogue_t { + // The cuBLAS documentation indicates that + // *__BIAS = *_, + // but we keep it verbose here for clarity. + switch (activation) { + case GEMMAndBiasActivationEpilogue::RELU: + return bias ? CUBLASLT_EPILOGUE_RELU_BIAS : CUBLASLT_EPILOGUE_RELU; + case GEMMAndBiasActivationEpilogue::GELU: + return bias ? CUBLASLT_EPILOGUE_GELU_BIAS : CUBLASLT_EPILOGUE_GELU; + default: + return bias ? 
CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT; + } + }(); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue); - if (bias != nullptr) { - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue); + if (bias) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias); } diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h index 86e960cc1ab4a..01d10f61da692 100644 --- a/aten/src/ATen/cuda/CUDAContextLight.h +++ b/aten/src/ATen/cuda/CUDAContextLight.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -88,8 +89,13 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); TORCH_CUDA_CPP_API void clearCublasWorkspaces(); -TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublas_handle_stream_to_workspace(); -TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace(); +struct WorkspaceMapWithMutex { + std::map, at::DataPtr> map; + std::shared_mutex mutex; +}; + +TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace(); +TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace(); TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize(); TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize(); TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace(); diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 31d2d3f1fe589..1c0687dcd5fb7 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -13,7 +14,7 @@ static bool _cuda_graphs_debug = false; MempoolId_t graph_pool_handle() { // Sets just the second value, to distinguish it from MempoolId_ts created from // cudaStreamGetCaptureInfo id_s in capture_begin. - return c10::cuda::MemPool::graph_pool_handle(); + return at::cuda::MemPool::graph_pool_handle(); } /** @@ -90,7 +91,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt } else { // User did not ask us to share a mempool. Create graph pool handle using is_user_created=false. // Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle(). - mempool_id_ = c10::cuda::MemPool::graph_pool_handle(false); + mempool_id_ = at::cuda::MemPool::graph_pool_handle(false); TORCH_INTERNAL_ASSERT(mempool_id_.first > 0); } @@ -174,17 +175,24 @@ void CUDAGraph::instantiate() { // Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people, // who prefer not to report error message through these arguments moving forward // (they prefer return value, or errors on api calls internal to the capture) -#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) - AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0)); + // ROCM appears to fail with HIP error: invalid argument +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && !defined(USE_ROCM) + AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, cudaGraphInstantiateFlagUseNodePriority)); #else AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0)); #endif //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory. //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch. 
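Stepping back to the gemm_and_bias hunks a little earlier: beta is now 0 only when a bias pointer will be fused in the epilogue and 1 otherwise, and the epilogue enum is chosen per activation and bias presence. A naive single-threaded reference of that epilogue shape, purely to pin down the semantics (a sketch, not the cuBLASLt call; row-major, no transposes, RELU standing in for whichever activation the lambda selects):

#include <algorithm>
#include <cstddef>
#include <vector>

void gemm_bias_act_ref(const std::vector<float>& A, const std::vector<float>& B,
                       std::vector<float>& C, const float* bias,
                       std::size_t M, std::size_t N, std::size_t K,
                       float alpha = 1.0f) {
  const float beta = bias ? 0.0f : 1.0f;  // bias is added in the epilogue unless nullptr
  for (std::size_t i = 0; i < M; ++i) {
    for (std::size_t j = 0; j < N; ++j) {
      float acc = 0.0f;
      for (std::size_t k = 0; k < K; ++k) acc += A[i * K + k] * B[k * N + j];
      float out = alpha * acc + beta * C[i * N + j];
      if (bias) out += bias[j];            // epilogue bias, broadcast across rows
      C[i * N + j] = std::max(out, 0.0f);  // epilogue activation (RELU here)
    }
  }
}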
} else { +#if !defined(USE_ROCM) + AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_, + graph_, + cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority)); +#else AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_, graph_, cudaGraphInstantiateFlagAutoFreeOnLaunch)); +#endif } has_graph_exec_ = true; } diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 6175e69827e2f..9ec3acf4cd29e 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) { // - Comments of @soumith copied from cuDNN handle pool implementation #ifdef NO_CUDNN_DESTROY_HANDLE #else - cublasDestroy(handle); + cublasDestroy(handle); #endif } @@ -107,19 +107,27 @@ using CuBlasPoolType = DeviceThreadHandlePool, at::DataPtr>& cublas_handle_stream_to_workspace() { - static auto& instance = *new std::map, at::DataPtr>; +WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() { + static auto& instance = *new WorkspaceMapWithMutex; return instance; } -std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace() { - static auto& instance = *new std::map, at::DataPtr>; +WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() { + static auto& instance = *new WorkspaceMapWithMutex; return instance; } void clearCublasWorkspaces() { - cublas_handle_stream_to_workspace().clear(); - cublaslt_handle_stream_to_workspace().clear(); + { + auto& workspace = cublas_handle_stream_to_workspace(); + std::unique_lock lock(workspace.mutex); + workspace.map.clear(); + } + { + auto& workspace = cublaslt_handle_stream_to_workspace(); + std::unique_lock lock(workspace.mutex); + workspace.map.clear(); + } } size_t parseChosenWorkspaceSize() { @@ -233,6 +241,38 @@ at::DataPtr getNewCUDABlasLtWorkspace() { return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize()); } +void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) { + cudaStream_t _stream = stream; + auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); + + auto& workspace = cublas_handle_stream_to_workspace(); + + size_t workspace_size = getChosenWorkspaceSize(); + + // Fast path: check if workspace already exists + { + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + if (workspace_it != workspace.map.end()) { + TORCH_CUDABLAS_CHECK(cublasSetWorkspace( + handle, workspace_it->second.get(), workspace_size)); + return; + } + } + + // Slow path: allocate workspace outside the lock + auto new_workspace = getNewWorkspace(); + + // Insert with lock (double-check in case another thread inserted while we + // were allocating) + { + std::unique_lock lock(workspace.mutex); + auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first; + TORCH_CUDABLAS_CHECK( + cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size)); + } +} + void* getCUDABlasLtWorkspace() { #ifndef USE_ROCM static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true; @@ -241,8 +281,10 @@ void* getCUDABlasLtWorkspace() { auto stream = c10::cuda::getCurrentCUDAStream(); cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key); - TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end()); + auto& workspace = 
at::cuda::cublas_handle_stream_to_workspace(); + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end()); return workspace_it->second.mutable_get(); } #endif @@ -250,11 +292,29 @@ void* getCUDABlasLtWorkspace() { auto stream = c10::cuda::getCurrentCUDAStream(); cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = cublaslt_handle_stream_to_workspace().find(key); - if (workspace_it == cublaslt_handle_stream_to_workspace().end()) { - workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()}); + + auto& workspace = cublaslt_handle_stream_to_workspace(); + + // Fast path: check if workspace already exists + { + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + if (workspace_it != workspace.map.end()) { + return workspace_it->second.mutable_get(); + } + } + + // Slow path: allocate workspace outside the lock + auto new_workspace = getNewCUDABlasLtWorkspace(); + + // Insert with lock (double-check in case another thread inserted while we + // were allocating) + { + std::unique_lock lock(workspace.mutex); + auto workspace_it = + workspace.map.try_emplace(key, std::move(new_workspace)).first; + return workspace_it->second.mutable_get(); } - return workspace_it->second.mutable_get(); } cublasHandle_t getCurrentCUDABlasHandle() { @@ -298,13 +358,8 @@ cublasHandle_t getCurrentCUDABlasHandle() { // will allocate memory dynamically (even if they're cheap) outside // PyTorch's CUDA caching allocator. It's possible that CCA used up // all the memory and cublas's cudaMallocAsync will return OOM - cudaStream_t _stream = stream; - auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = cublas_handle_stream_to_workspace().find(key); - if (workspace_it == cublas_handle_stream_to_workspace().end()) { - workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()}); - } - TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize())); + setWorkspaceForHandle(handle, stream); + #if !defined(USE_ROCM) // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. diff --git a/aten/src/ATen/cuda/MemPool.cpp b/aten/src/ATen/cuda/MemPool.cpp new file mode 100644 index 0000000000000..99405965898e0 --- /dev/null +++ b/aten/src/ATen/cuda/MemPool.cpp @@ -0,0 +1,69 @@ +#include +#include + +namespace at::cuda { + +// uid_ is incremented when a user creates a MemPool, +// for example: using graph_pool_handle() or c10::cuda::MemPool(). +// +// uuid_ is incremented when CUDAGraph creates a MemPool +// as a result of a user not providing a pool. +// +// MempoolId_t of {0, 0} is used to denote when no MemPool has been +// passed to a function, either by user or CUDAGraphs. For example, +// default value of MempoolId_t for capture_begin function is {0, 0}. +// That's why uid_ and uuid_ start at 1. 
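The CublasHandlePool changes above guard the workspace maps with a read-mostly pattern: a shared_lock on the lookup fast path, allocation outside any lock, then a unique_lock plus try_emplace so a concurrent insertion wins and the spare allocation is simply dropped. A minimal standalone sketch of that pattern, with hypothetical types and names:

#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>

struct Cache {
  std::map<int, std::unique_ptr<int>> map;
  std::shared_mutex mutex;
};

int* get_or_create(Cache& cache, int key) {
  {
    std::shared_lock<std::shared_mutex> lock(cache.mutex);  // fast path: readers share the lock
    auto it = cache.map.find(key);
    if (it != cache.map.end()) {
      return it->second.get();
    }
  }
  auto fresh = std::make_unique<int>(key);  // slow path: "allocate" outside any lock
  std::unique_lock<std::shared_mutex> lock(cache.mutex);
  // try_emplace keeps the existing entry if another thread inserted first.
  auto it = cache.map.try_emplace(key, std::move(fresh)).first;
  return it->second.get();
}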
+std::atomic MemPool::uid_{1}; +std::atomic MemPool::uuid_{1}; + +MemPool::MemPool( + CUDACachingAllocator::CUDAAllocator* allocator, + bool is_user_created, + bool use_on_oom) + : allocator_(allocator), is_user_created_(is_user_created) { + if (is_user_created_) { + id_ = {0, uid_++}; + } else { + id_ = {uuid_++, 0}; + } + device_ = c10::cuda::current_device(); + CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator); + if (use_on_oom) { + CUDACachingAllocator::setUseOnOOM(device_, id_); + } +} + +MemPool::~MemPool() { + // TORCH_INTERNAL_ASSERT(use_count() == 1); + // We used to assert that TORCH_INTERNAL_ASSERT(use_count() == 1); + // However, this assertion is not true if a memory pool is shared + // with a cuda graph. That CUDAGraph will increase the use count + // until it is reset. + CUDACachingAllocator::releasePool(device_, id_); + c10::cuda::CUDACachingAllocator::emptyCache(id_); +} + +MempoolId_t MemPool::id() { + return id_; +} + +CUDACachingAllocator::CUDAAllocator* MemPool::allocator() { + return allocator_; +} + +int MemPool::use_count() { + return CUDACachingAllocator::getPoolUseCount(device_, id_); +} + +c10::DeviceIndex MemPool::device() { + return device_; +} + +MempoolId_t MemPool::graph_pool_handle(bool is_user_created) { + if (is_user_created) { + return {0, uid_++}; + } + return {uuid_++, 0}; +} + +} // namespace at::cuda diff --git a/aten/src/ATen/cuda/MemPool.h b/aten/src/ATen/cuda/MemPool.h new file mode 100644 index 0000000000000..ba281c96b7043 --- /dev/null +++ b/aten/src/ATen/cuda/MemPool.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +namespace at::cuda { + +// Keep BC only +using c10::CaptureId_t; +using c10::MempoolId_t; + +// MemPool represents a pool of memory in a caching allocator. Currently, +// it's just the ID of the pool object maintained in the CUDACachingAllocator. +// +// An allocator pointer can be passed to the MemPool to define how the +// allocations should be done in the pool. For example: using a different +// system allocator such as ncclMemAlloc. +struct TORCH_CUDA_CPP_API MemPool { + MemPool( + c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator = nullptr, + bool is_user_created = true, + bool use_on_oom = false); + MemPool(const MemPool&) = delete; + MemPool(MemPool&&) = default; + MemPool& operator=(const MemPool&) = delete; + MemPool& operator=(MemPool&&) = default; + ~MemPool(); + + MempoolId_t id(); + c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator(); + int use_count(); + c10::DeviceIndex device(); + static MempoolId_t graph_pool_handle(bool is_user_created = true); + + private: + static std::atomic uid_; + static std::atomic uuid_; + c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator_; + bool is_user_created_; + MempoolId_t id_; + c10::DeviceIndex device_; +}; + +} // namespace at::cuda diff --git a/aten/src/ATen/cuda/cub.h b/aten/src/ATen/cuda/cub.h index 7430edaf8a3dc..bca9b1faff523 100644 --- a/aten/src/ATen/cuda/cub.h +++ b/aten/src/ATen/cuda/cub.h @@ -24,7 +24,13 @@ namespace detail { // radix_sort_pairs doesn't interact with value_t other than to copy // the data, so we can save template instantiations by reinterpreting // it as an opaque type. +// We use native integer types for 1/2/4/8-byte values to reduce +// register usage in CUDA kernels. For sizes > 8 fall back to char array. 
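The cub.h comment just above, together with the OpaqueType specializations that follow, relies on a small reinterpretation trick: radix_sort_pairs shuffles values as same-size opaque blobs, so only one instantiation per value size is needed, and 1/2/4/8-byte blobs are backed by a native integer instead of a char array. A standalone sketch of that idea, with illustrative names only:

#include <cstdint>
#include <cstring>
#include <type_traits>

template <int N> struct alignas(N) Blob { char data[N]; };  // generic fallback for larger sizes
template <> struct alignas(1) Blob<1> { uint8_t data; };
template <> struct alignas(2) Blob<2> { uint16_t data; };
template <> struct alignas(4) Blob<4> { uint32_t data; };   // fits a single register
template <> struct alignas(8) Blob<8> { uint64_t data; };

template <typename T>
Blob<sizeof(T)> as_blob(const T& v) {
  static_assert(std::is_trivially_copyable_v<T>, "only bit-copyable values");
  Blob<sizeof(T)> b;
  std::memcpy(&b, &v, sizeof(T));  // bit-copy; the sort never interprets the payload
  return b;
}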
template struct alignas(N) OpaqueType { char data[N]; }; +template <> struct alignas(1) OpaqueType<1> { uint8_t data; }; +template <> struct alignas(2) OpaqueType<2> { uint16_t data; }; +template <> struct alignas(4) OpaqueType<4> { uint32_t data; }; +template <> struct alignas(8) OpaqueType<8> { uint64_t data; }; template void radix_sort_pairs_impl( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index b7f80101d926e..b2b9be4498e5b 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -21,6 +21,7 @@ #if AT_CUDNN_ENABLED() #include +#include #endif #if AT_MAGMA_ENABLED() @@ -351,6 +352,26 @@ long CUDAHooks::versionCuDNN() const { #endif } +long CUDAHooks::versionRuntimeCuDNN() const { +#if AT_CUDNN_ENABLED() +#ifndef USE_STATIC_CUDNN + return cudnnGetVersion(); +#else + return CUDNN_VERSION; +#endif +#else + TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN"); +#endif +} + +long CUDAHooks::versionCuDNNFrontend() const { +#if AT_CUDNN_ENABLED() + return CUDNN_FRONTEND_VERSION; +#else + TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN"); +#endif +} + long CUDAHooks::versionMIOpen() const { #if AT_ROCM_ENABLED() return MIOPEN_VERSION_MAJOR * 10000 + @@ -390,16 +411,16 @@ std::string CUDAHooks::showConfig() const { // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number if(v < 500) { // If major=xx, minor=yy then format -> xxyy - oss << (v / 100) << "." << (v % 10); + oss << (v / 100) << '.' << (v % 10); } else { // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz - oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000); + oss << (v / 10000000) << '.' << (v / 100000 % 100) << '.' << (v % 100000); } #else - oss << (v / 1000) << "." << (v / 10 % 100); + oss << (v / 1000) << '.' << (v / 10 % 100); if (v % 10 != 0) { - oss << "." << (v % 10); + oss << '.' << (v % 10); } #endif }; @@ -410,16 +431,16 @@ std::string CUDAHooks::showConfig() const { oss << " - HIP Runtime "; #endif printCudaStyleVersion(runtimeVersion); - oss << "\n"; + oss << '\n'; // TODO: Make HIPIFY understand CUDART_VERSION macro #if !defined(USE_ROCM) if (runtimeVersion != CUDART_VERSION) { oss << " - Built with CUDA Runtime "; printCudaStyleVersion(CUDART_VERSION); - oss << "\n"; + oss << '\n'; } - oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n"; + oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << '\n'; #endif #if !defined(USE_ROCM) @@ -427,9 +448,9 @@ std::string CUDAHooks::showConfig() const { auto printCudnnStyleVersion = [&](size_t v) { - oss << (v / 1000) << "." << (v / 100 % 10); + oss << (v / 1000) << '.' << (v / 100 % 10); if (v % 100 != 0) { - oss << "." << (v % 100); + oss << '.' << (v % 100); } }; @@ -440,22 +461,22 @@ std::string CUDAHooks::showConfig() const { if (cudnnCudartVersion != CUDART_VERSION) { oss << " (built against CUDA "; printCudaStyleVersion(cudnnCudartVersion); - oss << ")"; + oss << ')'; } - oss << "\n"; + oss << '\n'; if (cudnnVersion != CUDNN_VERSION) { oss << " - Built with CuDNN "; printCudnnStyleVersion(CUDNN_VERSION); - oss << "\n"; + oss << '\n'; } #endif #else // TODO: Check if miopen has the functions above and unify - oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n"; + oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << '.' << MIOPEN_VERSION_MINOR << '.' 
<< MIOPEN_VERSION_PATCH << '\n'; #endif #if AT_MAGMA_ENABLED() - oss << " - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n"; + oss << " - Magma " << MAGMA_VERSION_MAJOR << '.' << MAGMA_VERSION_MINOR << '.' << MAGMA_VERSION_MICRO << '\n'; #endif return oss.str(); diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 8d3d1db003928..8902c68d342f8 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -49,6 +49,8 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasCUDART() const override; long versionCUDART() const override; long versionCuDNN() const override; + long versionRuntimeCuDNN() const override; + long versionCuDNNFrontend() const override; long versionMIOpen() const override; std::string showConfig() const override; double batchnormMinEpsilonCuDNN() const override; diff --git a/aten/src/ATen/cuda/jiterator.cu b/aten/src/ATen/cuda/jiterator.cu index 3af5104288d21..d664c828bdad6 100644 --- a/aten/src/ATen/cuda/jiterator.cu +++ b/aten/src/ATen/cuda/jiterator.cu @@ -42,7 +42,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic( // The cache key includes all the parameters to generate_code + vec_size + dev_idx std::stringstream ss; - ss << nInputs << "_" << nOutputs << f; + ss << nInputs << '_' << nOutputs << f; ss << f_inputs_type_str << compute_type_str << result_type_str; ss << static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); ss << extra_args_types; @@ -144,7 +144,7 @@ static inline void launch_jitted_unrolled_kernel_dynamic( // The cache key includes all the parameters to generate_code + dev_idx std::stringstream ss; - ss << nInputs << "_" << nOutputs << f; + ss << nInputs << '_' << nOutputs << f; ss << f_inputs_type_str << compute_type_str << result_type_str; ss << contiguous << dynamic_casting; ss << static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 9fb04b40d30f6..eb7e381d27766 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -52,10 +52,10 @@ TuningContext* getTuningContext() { std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) { static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1"; if (!blaslog) { - return stream << entry.key_ << "," << entry.time_; + return stream << entry.key_ << ',' << entry.time_; } else { - return stream << entry.key_ << "," << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_; + return stream << entry.key_ << ',' << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_; } } @@ -156,10 +156,10 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std if (isNew) { static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1"; if (!blaslog) { - untuned_file << op_signature << "," << params_signature << std::endl; + untuned_file << op_signature << ',' << params_signature << std::endl; } else { - untuned_file << op_signature << "," << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl; + untuned_file << op_signature << ',' << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl; } TUNABLE_LOG3("Untuned,", op_signature, ",", params_signature); } @@ -201,7 +201,7 @@ void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const if(!file_exists || file_empty) { for(const auto& [key, val] : validators) 
{ - (*realtime_out_) << "Validator," << key << "," << val << std::endl; + (*realtime_out_) << "Validator," << key << ',' << val << std::endl; realtime_out_->flush(); } validators_written_ = true; @@ -219,7 +219,7 @@ void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std return; } - (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl; + (*realtime_out_) << op_sig << ',' << param_sig << ',' << result << std::endl; realtime_out_->flush(); //ensure immediate write to disk TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result); diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index 8636d267209e9..a2cb0cb0a1025 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -93,31 +93,31 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) { return "CUDNN_DATA_UINT8x4"; default: std::ostringstream oss; - oss << "(unknown data-type " << static_cast(dtype) << ")"; + oss << "(unknown data-type " << static_cast(dtype) << ')'; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { - out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; + out << "TensorDescriptor " << static_cast(d.desc()) << '\n'; int nbDims = 0; int dimA[CUDNN_DIM_MAX]; int strideA[CUDNN_DIM_MAX]; cudnnDataType_t dtype{}; cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA); - out << " type = " << cudnnTypeToString(dtype) << "\n"; - out << " nbDims = " << nbDims << "\n"; + out << " type = " << cudnnTypeToString(dtype) << '\n'; + out << " nbDims = " << nbDims << '\n'; // Read out only nbDims of the arrays! out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; out << " strideA = "; for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; return out; } @@ -168,27 +168,27 @@ std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) { return "CUDNN_TENSOR_NHWC"; default: std::ostringstream oss; - oss << "(unknown cudnn tensor format " << static_cast(tformat) << ")"; + oss << "(unknown cudnn tensor format " << static_cast(tformat) << ')'; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) { - out << "FilterDescriptor " << static_cast(d.desc()) << "\n"; + out << "FilterDescriptor " << static_cast(d.desc()) << '\n'; int nbDims = 0; int dimA[CUDNN_DIM_MAX]; cudnnDataType_t dtype{}; cudnnTensorFormat_t tformat{}; cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA); - out << " type = " << cudnnTypeToString(dtype) << "\n"; - out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n"; - out << " nbDims = " << nbDims << "\n"; + out << " type = " << cudnnTypeToString(dtype) << '\n'; + out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << '\n'; + out << " nbDims = " << nbDims << '\n'; // Read out only nbDims of the arrays! 
out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; return out; } diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index f1f2056917472..0ab8e82a30166 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -174,6 +174,14 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP); } + virtual long versionRuntimeCuDNN() const { + TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP); + } + + virtual long versionCuDNNFrontend() const { + TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP); + } + virtual long versionMIOpen() const { TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. ", CUDA_HELP); } diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index 985b289b3fe02..14be24d63e65a 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -157,6 +157,8 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::Negative, DispatchKey::Conjugate, DispatchKey::XLA, + DispatchKey::XPU, + DispatchKey::HPU, DispatchKey::CUDA, DispatchKey::CPU, DispatchKey::PrivateUse1, diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 69af08a7bd7ce..518098a8b4a80 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -346,15 +346,15 @@ void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int6 } std::ostream& operator<< (std::ostream& os, const DynamicLayer& layer) { - os << layer.layerId() << ":" << layer.key(); + os << layer.layerId() << ':' << layer.key(); return os; } std::ostream& operator<< (std::ostream& os, const std::vector& dls) { os << "DynamicLayerStack[ "; for (const auto& layer : dls) { - os << layer << " "; + os << layer << ' '; } - os << "]"; + os << ']'; return os; } diff --git a/aten/src/ATen/functorch/TensorWrapper.cpp b/aten/src/ATen/functorch/TensorWrapper.cpp index 65de9268927f0..ba5dcfc923878 100644 --- a/aten/src/ATen/functorch/TensorWrapper.cpp +++ b/aten/src/ATen/functorch/TensorWrapper.cpp @@ -22,7 +22,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) { if (batched) { ss << "Batched[lvl=" << batched->level() << " dim=" << batched->bdim() << ", "; dumpTensor(ss, batched->value()); - ss << "]"; + ss << ']'; return; } ss << "Tensor" << tensor.sizes(); @@ -36,7 +36,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) { ss << "dead, "; } dumpTensor(ss, wrapped->value()); - ss << "]"; + ss << ']'; } void TensorWrapper::refreshMetadata() { diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 86e42ee3b66dc..3fe27c7a0825b 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -73,32 +73,32 @@ std::string miopenTypeToString(miopenDataType_t dtype) { return "miopenBFloat16"; default: std::ostringstream oss; - oss << "(unknown data-type " << static_cast(dtype) << ")"; + oss << "(unknown data-type " << static_cast(dtype) << ')'; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { - out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; + out << "TensorDescriptor 
" << static_cast(d.desc()) << '\n'; int nbDims = 0; int dimA[MIOPEN_DIM_MAX]; int strideA[MIOPEN_DIM_MAX]; miopenDataType_t dtype; miopenGetTensorDescriptorSize(d.desc(), &nbDims); miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA); - out << " type = " << miopenTypeToString(dtype) << "\n"; - out << " nbDims = " << nbDims << "\n"; + out << " type = " << miopenTypeToString(dtype) << '\n'; + out << " nbDims = " << nbDims << '\n'; // Read out only nbDims of the arrays! out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; out << " strideA = "; for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; return out; } diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm index c8b3453fc81dd..dfdd67c8f4458 100644 --- a/aten/src/ATen/mps/MPSAllocator.mm +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -440,7 +440,7 @@ // we need to release the lock temporarily as synchronizing may cause deadlock with completion handlers. m_mutex.unlock(); auto stream = getDefaultMPSStream(); - dispatch_sync(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { stream->synchronize(SyncType::COMMIT_AND_WAIT); }); m_mutex.lock(); diff --git a/aten/src/ATen/mps/MPSProfiler.h b/aten/src/ATen/mps/MPSProfiler.h index c1cb9090fc4af..187e86d92e1bf 100644 --- a/aten/src/ATen/mps/MPSProfiler.h +++ b/aten/src/ATen/mps/MPSProfiler.h @@ -91,7 +91,7 @@ struct OperationInfo : BaseInfo { std::stringstream kernelStr; kernelStr << kernelName; for (const Tensor& tensor : tensors) { - kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId); + kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId); } return kernelStr.str(); } diff --git a/aten/src/ATen/mps/MPSProfiler.mm b/aten/src/ATen/mps/MPSProfiler.mm index a91574c56c52d..1d0408b8089c9 100644 --- a/aten/src/ATen/mps/MPSProfiler.mm +++ b/aten/src/ATen/mps/MPSProfiler.mm @@ -39,9 +39,9 @@ // see comments for INCLUDE_BUFFER_ID if (includeBufferId && deviceType == at::kMPS) { id buffer = __builtin_bit_cast(id, tensor.storage().data()); - tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ":" << buffer.retainCount << ")"; + tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ':' << buffer.retainCount << ')'; } - tensorStr << ":" << tensor.scalar_type() << tensor.sizes(); + tensorStr << ':' << tensor.scalar_type() << tensor.sizes(); return tensorStr.str(); } else { return "undefined"; diff --git a/aten/src/ATen/mps/MPSStream.h b/aten/src/ATen/mps/MPSStream.h index 10627cfc36b80..b00890b9f5901 100644 --- a/aten/src/ATen/mps/MPSStream.h +++ b/aten/src/ATen/mps/MPSStream.h @@ -110,6 +110,9 @@ class TORCH_API MPSStream { return _stream; } + MTLBuffer_t getErrorBuffer(); + void checkLastError(); + private: Stream _stream; MTLCommandQueue_t _commandQueue = nil; @@ -121,6 +124,8 @@ class TORCH_API MPSStream { dispatch_queue_t _serialQueue = nullptr; // CommitAndContinue is enabled by default bool _enableCommitAndContinue = true; + // Buffer that contains last raised error + MTLBuffer_t _errorBuffer = nil; // use synchronize() to access any of these commit functions outside MPSStream void commit(); @@ -155,4 +160,7 @@ class TORCH_API MPSStreamImpl { MPSStreamImpl(); }; +#ifdef __OBJC__ +void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()); +#endif } // namespace at::mps diff --git a/aten/src/ATen/mps/MPSStream.mm 
b/aten/src/ATen/mps/MPSStream.mm index 595d71aeef15a..2150c21c18d75 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -3,13 +3,13 @@ #include #include #include +#include @interface MPSGraphExecutionDescriptor () @property(readwrite, atomic) BOOL enableCommitAndContinue; @end namespace at::mps { - //----------------------------------------------------------------- // MPSStream //----------------------------------------------------------------- @@ -30,6 +30,10 @@ @interface MPSGraphExecutionDescriptor () // Choose level which optimizes for GPU _compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0; _executionDescriptor.compilationDescriptor = _compilationDescriptor; + + _errorBuffer = [MPSDevice::getInstance()->device() newBufferWithLength:sizeof(c10::metal::ErrorMessages) + options:MTLResourceStorageModeShared]; + std::memset([_errorBuffer contents], 0, 1024); } MPSStream::~MPSStream() { @@ -38,6 +42,8 @@ @interface MPSGraphExecutionDescriptor () [_executionDescriptor release]; [_compilationDescriptor release]; _executionDescriptor = nil; + [_errorBuffer release]; + _errorBuffer = nil; _compilationDescriptor = nil; assert(_commandBuffer == nil); @@ -104,6 +110,7 @@ @interface MPSGraphExecutionDescriptor () [_prevCommandBuffer waitUntilCompleted]; [_prevCommandBuffer release]; _prevCommandBuffer = nil; + checkLastError(); } if (_commandBuffer) { @@ -111,6 +118,7 @@ @interface MPSGraphExecutionDescriptor () [_commandBuffer waitUntilCompleted]; [_commandBuffer release]; _commandBuffer = nil; + checkLastError(); } } @@ -153,7 +161,7 @@ @interface MPSGraphExecutionDescriptor () if (length == 0) { return; } - dispatch_sync(_serialQueue, ^() { + dispatch_sync_with_rethrow(_serialQueue, ^() { @autoreleasepool { endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; @@ -183,7 +191,7 @@ @interface MPSGraphExecutionDescriptor () size_t dstOffset, uint64_t profileId, SyncType syncType) { - dispatch_sync(_serialQueue, ^() { + dispatch_sync_with_rethrow(_serialQueue, ^() { @autoreleasepool { endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; @@ -236,7 +244,7 @@ @interface MPSGraphExecutionDescriptor () auto& profiler = getMPSProfiler(); const bool isGraphProfilingEnabled = profiler.isOperationProfilingEnabled(); - dispatch_sync(_serialQueue, ^() { + dispatch_sync_with_rethrow(_serialQueue, ^() { endKernelCoalescing(); if (isGraphProfilingEnabled) { // this function call is only relevant for interval-based Signposts @@ -266,6 +274,24 @@ @interface MPSGraphExecutionDescriptor () }); } +id MPSStream::getErrorBuffer() { + return _errorBuffer; +} + +void MPSStream::checkLastError() { + auto msgs = reinterpret_cast([_errorBuffer contents]); + const auto& msg = msgs->msg[0]; + if (!msgs) { + return; + } + unsigned int count = 0; + std::swap(count, msgs->count); + if (!count) { + return; + } + throw c10::AcceleratorError({msg.func, msg.file, msg.line}, 1, msg.message); +} + //----------------------------------------------------------------- // MPSStreamImpl //----------------------------------------------------------------- @@ -289,4 +315,19 @@ @interface MPSGraphExecutionDescriptor () return MPSStreamImpl::getInstance(); } +// Helper methods +void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) { + __block std::optional block_exception; + dispatch_sync(queue, ^() { + try { + block(); + } catch (...) 
{ + block_exception = std::current_exception(); + } + }); + if (block_exception) { + std::rethrow_exception(*block_exception); + } +} + } // namespace at::mps diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index f5d5edb6439a6..2fa6bcc6dc9ac 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -1009,12 +1009,25 @@ static Device correct_out_device(const Tensor& self, const Tensor& other) { } } +static Tensor send_to_meta(const Tensor& self, const Device& device) { + Tensor out_meta; + if (self._is_zerotensor() && self.unsafeGetTensorImpl()->is_wrapped_number()) { + out_meta = at::_efficientzerotensor(self.sizes(), self.options().device(device)); + out_meta.unsafeGetTensorImpl()->set_wrapped_number(true); + } else { + out_meta = self.to(device); + } + return out_meta; +} + Tensor mul_zerotensor(const Tensor& self, const Tensor& other) { auto out_device = correct_out_device(self, other); // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta); - auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_)); + auto self_meta = send_to_meta(self, device_); + auto other_meta = send_to_meta(other, device_); + auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self_meta, other_meta); return at::_efficientzerotensor(meta_out.sizes(), meta_out.options().device(out_device)); } @@ -1023,7 +1036,9 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) { // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta); - auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_)); + auto self_meta = send_to_meta(self, device_); + auto other_meta = send_to_meta(other, device_); + auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self_meta, other_meta); if (self._is_zerotensor()) { if (other._is_zerotensor()) { @@ -1052,8 +1067,9 @@ static Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta); - auto meta_out = at::_ops::add_Tensor::redispatch( - meta_dks, self.to(device_), other.to(device_), alpha); + auto self_meta = send_to_meta(self, device_); + auto other_meta = send_to_meta(other, device_); + auto meta_out = at::_ops::add_Tensor::redispatch(meta_dks, self_meta, other_meta, alpha); auto get_out_like = [&] (const Tensor& tensor) { diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 892144ac663a6..2a3388a052685 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -167,7 +167,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co std::stringstream ss; ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); - ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + ss << args.back() << ")" << " (while checking arguments for " << c << ')'; TORCH_CHECK(false, ss.str()); } } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 
2c3f14aab911c..cb37f6f1030d3 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -409,7 +409,7 @@ struct ConvParams { if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { return false; } - static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); + static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN(); // broken on cuDNN 9.8 - 9.14 if (cudnn_version >= 90800 && cudnn_version < 91500) { if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous && @@ -453,7 +453,7 @@ struct ConvParams { } // native kernel doesn't support 64-bit non-splittable case if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { - static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; + static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1; // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { if (cudnn_version < 0 || cudnn_version > 91000) { @@ -639,7 +639,7 @@ static std::ostream& operator<<(std::ostream & out, const ConvParams& params) << " deterministic = " << params.deterministic << " cudnn_enabled = " << params.cudnn_enabled << " allow_tf32 = " << params.allow_tf32 - << "}"; + << '}'; return out; } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 1da245972f0cb..fbabba84dbb2d 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -50,18 +50,35 @@ static inline bool parseLinearFlatten3d() { // `_flatten_nd_linear` flattens all but the last dimension of the input tensor // before passing it to linear operation static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weight, const Tensor& bias) { - const auto input_sizes = input.sym_sizes(); - // can't use -1 in reshape because it errors when a dimension is 0 - c10::SymInt flattened_dim = 1; - for (int64_t i = 0, ndim = input_sizes.size(); i < ndim - 1; ++i) { - flattened_dim = flattened_dim * input_sizes[i]; + const auto input_sizes = input.sym_sizes(); + + const auto result_flattened = [&]() -> Tensor { + const auto input_ncols = input_sizes.back(); + const auto input_flattened_nrows = [&]() -> c10::SymInt { + // can't use -1 in reshape because it errors when a dimension is 0 + auto flattened_nrows = c10::SymInt{1}; + for (const auto& size : input_sizes.slice(0, input_sizes.size() - 1)) { + flattened_nrows *= size; + } + return flattened_nrows; + }(); + + const auto input_flattened = input.view_symint({input_flattened_nrows, input_ncols}); + if (weight.layout() == c10::kStrided) { + return at::addmm(bias, input_flattened, weight.t()); + } else { + // weight is sparse, and addmm for sparse expects matmul lhs to be sparse, + // so we transpose the problem. + // NOTE: at::matmul handles (dense @ sparse) similarly. + const auto bias_t = (bias.dim() >= 2) ? 
bias.mT() : bias.unsqueeze(-1); + return at::addmm(bias_t, weight, input_flattened.t()).t(); } - auto inp_reshape = input.reshape_symint({flattened_dim, input_sizes.at(input_sizes.size() -1)}); - const auto result = at::addmm(bias, inp_reshape, weight.t()); - auto new_size = input_sizes.slice(0, input_sizes.size() - 1); - c10::SymDimVector sizes_vec(new_size.begin(), new_size.end()); - sizes_vec.push_back(result.sym_size(1)); - return result.view_symint(sizes_vec); + }(); + + // Unflatten flattened row dims + auto result_sizes = c10::SymDimVector{input_sizes.begin(), input_sizes.end()}; + result_sizes.back() = result_flattened.sym_size(1); + return result_flattened.view_symint(result_sizes); } @@ -90,15 +107,23 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optionaldefined() && !input.is_xla()) { - // Also hit the fused path for contiguous 3D input, if not using xla + + const auto is_bias_likely_fusable = ( + bias->defined() && + // cuBLASLt: will fuse in the epilogue without copies + // when input/weight/bias are all strided. + // When weight is not strided, bias will not be fused, + // but we can still dispatch here to avoid at::matmul + // path which will probably use a very similar + // flattening optimization. + ((bias->dim() == 1 || bias->squeeze().dim() == 1) && bias->is_contiguous_or_false()) + ); + if (is_bias_likely_fusable && !input.is_xla()) { + // Also hit the fused path for contiguous nD input, if not using xla // backend. Reshaping/flattening has some performance implications on xla. - bool is_contiguous = input.is_contiguous_or_false(); - if (is_contiguous && input_dim == 3) { - return _flatten_nd_linear(input, weight, *bias); - } else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) { + if (input.is_contiguous_or_false()) { return _flatten_nd_linear(input, weight, *bias); - } else if (parseLinearFlatten3d() && input_dim == 3) { + } else if (parseLinearFlatten3d()) { // If user forces flattening via env var const Tensor input_cont = input.contiguous(); return _flatten_nd_linear(input_cont, weight, *bias); diff --git a/aten/src/ATen/native/PackedSequence.cpp b/aten/src/ATen/native/PackedSequence.cpp index d069108348d24..be7961b2a2452 100644 --- a/aten/src/ATen/native/PackedSequence.cpp +++ b/aten/src/ATen/native/PackedSequence.cpp @@ -142,6 +142,7 @@ Tensor _pack_padded_sequence_backward_symint(const Tensor& grad, c10::SymIntArra std::tuple _pad_packed_sequence(const Tensor& data, const Tensor& _batch_sizes, bool batch_first, const Scalar& padding_value, int64_t total_length) { auto batch_sizes_t = _batch_sizes.contiguous(); checkLongTensor(batch_sizes_t); + TORCH_CHECK(batch_sizes_t.numel() > 0, "batch_sizes can not be empty"); int64_t * batch_sizes = batch_sizes_t.data_ptr(); int64_t max_batch_size = batch_sizes[0]; diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 79aaac48034ac..975e237c468d6 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -847,7 +847,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional> indices; indices.resize(dim + 1); indices.set(dim, index); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index c6126eda61e73..8a0b38eafab36 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -479,6 
+480,14 @@ Tensor isfinite(const Tensor& self) { }); } +void _async_error(std::string_view msg) { + TORCH_CHECK(0, msg); +} + +void _async_error_meta(std::string_view msg) { + // Do NOT error, it's an async error! +} + void _assert_async_cpu(const Tensor& self) { TORCH_CHECK( native::is_nonzero(self), @@ -514,7 +523,7 @@ Tensor _functional_assert_async_msg_cpu( } void _print(std::string_view s) { - std::cout << s << "\n"; + std::cout << s << '\n'; } // Sorting-based algorithm for isin(); used when the number of test elements is diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 6df7761d822db..0079a530b3d0e 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1,5 +1,6 @@ #include #include +#include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -1710,11 +1711,37 @@ Tensor narrow_symint( "], but got ", start, ")") - if (start < 0) { - start = start + cur_size; - } + + auto cond1 = TORCH_GUARD_OR_FALSE(start.sym_lt(0)); + auto cond2 = TORCH_GUARD_OR_FALSE(start.sym_ge(0)); + + if (cond1 || cond2) { + if (cond1) { + start = start + cur_size; + } + + TORCH_SYM_CHECK( + start.sym_le(cur_size - length), + "start (", + start, + ") + length (", + length, + ") exceeds dimension size (", + cur_size, + ")."); + return at::slice_symint(self, dim, start, start + length, 1); + } + + // Unbacked start handling! + + // Bounds check without converting start: + // - If start < 0: need (start + cur_size) + length <= cur_size, i.e., start + + // length <= 0 + // - If start >= 0: need start + length <= cur_size + auto end = start + length; TORCH_SYM_CHECK( - start.sym_le(cur_size - length), + (start.sym_lt(0).sym_and((end).sym_le(0))) + .sym_or(start.sym_ge(0).sym_and((end).sym_le(cur_size))), "start (", start, ") + length (", @@ -1722,7 +1749,28 @@ Tensor narrow_symint( ") exceeds dimension size (", cur_size, ")."); - return at::slice_symint(self, dim, start, start + length, 1); + + if (TORCH_GUARD_OR_FALSE(end.sym_ne(0))) { + return at::slice_symint(self, dim, start, end, 1); + } else { + // Cannot statically determine the condition due to unbacked. + // This is an interesting situation; when start is negative and + // start + length == 0, slice and narrow do different things. + // i.e., x.narrow(0, -2, 2) != x[-2:0]; in that case, we want to + // pass curr_size instead of 0. Otherwise, they would do the same thing. + // This says at runtime: if start < 0 and end == 0, then pass curr_size + // instead of 0. + + auto use_different = start.sym_lt(0).sym_and(end.sym_eq(0)).toSymInt(); + auto result = + at::slice_symint(self, dim, start, end + use_different * cur_size, 1); + + // Ensure slice allocated unbacked size is specialized to length. 
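A minimal standalone sketch (not part of the patch; assumes a stock libtorch build) of the start + length == 0 corner case this unbacked branch guards against: narrow with a negative start keeps the trailing rows, while a literal slice whose end stays 0 is empty, which is why the code above substitutes cur_size for the end in that situation.

```cpp
// Illustrative only: shows why narrow(0, -2, 2) cannot be lowered to slice(0, -2, 0).
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor x = at::arange(6).reshape({3, 2});
  auto a = at::narrow(x, /*dim=*/0, /*start=*/-2, /*length=*/2);  // last two rows
  auto b = at::slice(x, /*dim=*/0, /*start=*/-2, /*end=*/0);      // empty: end 0 is not remapped
  std::cout << a.sizes() << " vs " << b.sizes() << '\n';          // [2, 2] vs [0, 2]
}
```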
+ SymInt new_size = result.sym_size(dim); + TORCH_SYM_CHECK(new_size.sym_eq(length), "") + + return result; + } } // This overload exists purely for XLA, because they wanted to pass in @@ -1736,8 +1784,8 @@ Tensor narrow_tensor_symint( start.dim() == 0 && isIntegralType(start.scalar_type(), /*includeBool=*/false), "start must be an 0-dim integral Tensor."); - int64_t st = start.item(); - return at::narrow_symint(self, dim, c10::SymInt(st), std::move(length)); + c10::SymInt st = start.item().toSymInt(); + return at::narrow_symint(self, dim, std::move(st), std::move(length)); } std:: diff --git a/aten/src/ATen/native/TransposeType.h b/aten/src/ATen/native/TransposeType.h index 603bf6fee60aa..bb63e6d542482 100644 --- a/aten/src/ATen/native/TransposeType.h +++ b/aten/src/ATen/native/TransposeType.h @@ -1,6 +1,8 @@ #pragma once #include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace at::native { // Used as an interface between the different BLAS-like libraries @@ -21,3 +23,5 @@ static inline char to_blas(TransposeType trans) { } } // namespace at::native + +C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index f849283043d37..acf14f3dfcdd5 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -904,19 +904,11 @@ Tensor mvlgamma(const Tensor& self, int64_t p) { return args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi) * QUARTER); } +// since mvlgamma_ has different signature from its +// out and functional variant, we explicitly +// define it (instead of using structured kernel). Tensor& mvlgamma_(Tensor& self, int64_t p) { - mvlgamma_check(self, p); - Tensor args = native::arange( - -p *HALF + HALF, - HALF, - HALF, - optTypeMetaToScalarType(self.options().dtype_opt()), - self.options().layout_opt(), - self.options().device_opt(), - self.options().pinned_memory_opt()); - args = args.add(self.unsqueeze(-1)); - const auto p2_sub_p = static_cast(p * (p - 1)); - return self.copy_(args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi) * QUARTER)); + return at::mvlgamma_out(self, self, p); } Tensor& mvlgamma_out(const Tensor& self, int64_t p, Tensor& result) { diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 7587988528ebb..73f8c136794ce 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -293,7 +293,7 @@ struct ComputeLocationBase { , empty(size <= 0) {} inline Vec unnormalize(const Vec &in) const { - return (in + Vec(1)) * Vec(scaling_factor) - Vec(0.5); + return (in + Vec(static_cast(1))) * Vec(scaling_factor) - Vec(static_cast(0.5)); } inline Vec clip_coordinates(const Vec &in) const { @@ -831,7 +831,7 @@ struct ApplyGridSample(-0.75)); ApplyGridSample(const TensorAccessor& input) : inp_H(input.size(2)) diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 186f7d8a6a78a..75a4d357a1c0b 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -147,14 +147,24 @@ static bool isGloballyDisabledAddmmCudaLt(const at::Device& device) { /* * Check whether for the given input we want to enable the Lt interface */ -static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { +static bool isInputCompliesAddmmCudaLt( + Tensor& result, + const Tensor& self, + const Tensor& mat1, + const Tensor& 
mat2, + const Scalar& beta, + const Scalar& alpha, + Activation activation ) { + #ifdef USE_ROCM // Implies 2D bias which we currently do not send through Lt. // TODO: this check is done pre col-major input preparation, // so, this condition can be relaxed in cases when a col-major // copy of result is needed. - if (result.is_same(self)) { + if (self.is_same(result) || self.dim() == 2) { return false; } + #endif #if defined(USE_ROCM) && ROCM_VERSION == 60400 // hipblaslt TT fp32 regression on ROCm 6.4, cannot use @@ -169,13 +179,33 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const #if defined(CUDA_VERSION) || defined(USE_ROCM) const auto scalar_type = mat1.scalar_type(); return (beta.toComplexDouble() == 1.0 + // NOTE: row-major result is important when bias is 1D. + // This is because Lt broadcasts 1D bias over the columns + // while the aten::addmm API broadcasts it over the rows, + // and this is in conjunction with the data preparation + // procedure that does not transpose arguments with + // col-major result. For col-major result we need + // to explicitly transpose the problem so that bias is + // correctly applied. + // TODO: enable col-major result if needed. + // TODO: no need to check result's layout when + // !result.is_same(self) and self.dim() == 2, because + // self needs to be copied into result and the bias ptr + // will be ignored. && result.dim() == 2 && result.is_contiguous() - // Conditions for bias to be fusable && ( - self.is_contiguous() && - // NOTE: fine to have 1-len dims to the left from the right-most one - (self.dim() == 1 || self.squeeze().dim() == 1) && - self.sizes().back() == mat2_sizes[1] + ( // Conditions for bias to be fusable -- implies direct Lt path without copies. + self.is_contiguous() && + // NOTE: fine to have 1-len dims to the left from the right-most one + (self.dim() == 1 || self.squeeze().dim() == 1) && + self.sizes().back() == mat2_sizes[1] + ) + || ( // 2D bias restrictions. self.is_contiguous() is implicit when result.is_same(self), + // and we need to copy self into result otherwise, so the self's layout becomes irrelevant. + // See also TODO from above. + activation != Activation::None && // Lt is faster when activation is fused + (self.dim() == 2 && at::is_expandable_to(self.sizes(), {mat1_sizes[0], mat2_sizes[1]})) + ) ) && ( // some dtype restrictions #ifndef USE_ROCM @@ -266,11 +296,16 @@ template bool launchGemmAndBiasCublasLt( // args contains result which is modified cublasCommonArgs& args, - const Tensor& self, + const std::optional& self, const Scalar& alpha, Activation activation = Activation::None ) { - const auto* self_ptr = self.const_data_ptr(); + // We apply bias in the epilogue only when it is 1D, + // or when it can be squeezed to 1D. + // self_ptr == nullptr implies ignore bias epilogue + // and use standard gemm-like API. + const auto* self_ptr = self.has_value() ? 
self.value().const_data_ptr() : static_cast(nullptr); + const auto tuning_ctx = at::cuda::tunable::getTuningContext(); if (tuning_ctx->IsTunableOpEnabled()) { @@ -353,34 +388,30 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override; #ifdef USE_ROCM // Conditioned on the device index, which is not persistent - disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt; + disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device()); #endif // Condition on the input - disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha) || disable_addmm_cuda_lt; - // } + disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation); at::ScalarType scalar_type = mat1.scalar_type(); bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float; + #ifdef USE_ROCM + disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input; + #endif + + bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt; + // for float output with half input cublasLT with bias produces wrong results + use_bias_ptr_lt &= !is_float_output_with_half_input; + // Handle result/self shapes if (!result.is_same(self)) { at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]}); - const auto self_maybe_expanded = [&]() -> c10::MaybeOwned { - if (disable_addmm_cuda_lt) { - // When in non-Lt path we do expand self even before - // check for beta != 0.0 to make sure that - // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_* - // runs green. - return expand_size(self, result.sizes(), "addmm"); - } - // copy next, should broadcast - return c10::MaybeOwned::borrowed(self); - }(); - // We copy bias when in the non-Lt path - if (beta.toComplexDouble() != 0.0 && disable_addmm_cuda_lt) { + // We do not copy bias only when we need the bias ptr + if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) { // NOTE: self should broadcast over result - at::native::copy_(result, *self_maybe_expanded); + at::native::copy_(result, *expand_size(self, result.sizes(), "addmm")); } } @@ -428,7 +459,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_type, "addmm_cuda_lt", [&] { - lt_success = launchGemmAndBiasCublasLt(args, self, alpha, activation); + lt_success = launchGemmAndBiasCublasLt(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation); } ); #endif @@ -440,7 +471,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_type, "addmm_cuda_lt", [&] { - lt_success = launchGemmAndBiasCublasLt(args, self, alpha, activation); + lt_success = launchGemmAndBiasCublasLt(args, use_bias_ptr_lt ? 
std::make_optional(self) : std::nullopt, alpha, activation); } ); } // end is_float_output_with_half_input @@ -896,7 +927,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { return _int_mm_out_cuda(self, mat2, result); } -static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { +static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional& self_baddbmm = std::nullopt) { // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -920,7 +951,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); - if (!is_bmm && self_baddbmm.has_value()) { + if (self_baddbmm.has_value()) { const auto& self = self_baddbmm.value(); TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor"); TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output"); @@ -928,15 +959,12 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat } Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { - IntArrayRef batch1_sizes = batch1.sizes(); - IntArrayRef batch2_sizes = batch2.sizes(); - - Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype)); + Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype)); return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out); } Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) { - baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true); + baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype); Scalar beta(0.0); Scalar alpha(1.0); { @@ -948,14 +976,16 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at } Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) { - // We need to copy the tensor - Tensor out = self.clone().to(self.options().dtype(out_dtype)); - - return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out); + TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(), + "self dtype must match either out_dtype or batch1 dtype"); + Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype)); + return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out); } Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { - baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self); + baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out); + // We need to copy the tensor + out.copy_(self); { NoNamesGuard guard; 
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); @@ -990,24 +1020,27 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca } Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) { - Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype)); + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype)); return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result); } Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { - TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type()); - TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); +// repeat dimensionality checks for direct calls to `out` overload TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); TORCH_CHECK( mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); + TORCH_CHECK(out_dtype == mat1.scalar_type() || + (out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)), + "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); - TORCH_CHECK(out_dtype == self.scalar_type() || - (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)), - "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); - TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(), + "self dtype must match either out_dtype or mat1 dtype"); addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha); diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index c42d03b9cbf7f..b83ec3c761e9b 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -884,6 +884,69 @@ struct type_specialized_kernel_launcher { } }; +template +struct type_specialized_broadcast_kernel_launcher { + template < + typename func_t, + typename array_t, + typename dtypes_t, + typename calc_t> + static void apply( + int64_t numel, + func_t f, + array_t data, + dtypes_t dtypes, + calc_t offset_calc) { + using traits = function_traits; + using ret_t = typename traits::result_type; + using arg0_t = typename traits::template arg<0>::type; + using arg1_t = typename traits::template 
arg<1>::type; + if (dtypes[0] == rt_binary_specializations[arg_index][0] && + dtypes[1] == rt_binary_specializations[arg_index][1] && + dtypes[2] == rt_binary_specializations[arg_index][2]) { + using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + auto u = c10::load(data[1] + offsets0[1]); + auto v = c10::load(data[2] + offsets0[2]); + ret_t result0 = f(c10::convert(u), c10::convert(v)); + auto u1 = c10::load(data[1] + offsets1[1]); + auto v1 = c10::load(data[2]+ offsets1[2]); + ret_t result1 = f(c10::convert(u1), c10::convert(v1)); + auto u2 = c10::load(data[1] + offsets2[1]); + auto v2 = c10::load(data[2] + offsets2[2]); + ret_t result2 = f(c10::convert(u2), c10::convert(v2)); + auto u3 = c10::load(data[1] + offsets3[1]); + auto v3 = c10::load(data[2] + offsets3[2]); + ret_t result3 = f(c10::convert(u3), c10::convert(v3)); + *(ret_cpp_t*)out0 = c10::convert(result0); + *(ret_cpp_t*)out1 = c10::convert(result1); + *(ret_cpp_t*)out2 = c10::convert(result2); + *(ret_cpp_t*)out3 = c10::convert(result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + auto u = c10::load(data[1] + offsets[1]); + auto v = c10::load(data[2] + offsets[2]); + ret_t result = f(c10::convert(u), c10::convert(v)); + *(ret_cpp_t*)out = c10::convert(result); + } + }); + } + } +}; + } // namespace #endif @@ -1002,6 +1065,32 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { } auto offset_calc = ::make_offset_calculator(iter); #ifdef USE_ROCM + if (check_binary_rt_types_for_specialization(iter)) { + // constexpr to reduce the amount of kernels generated for + // broadcast elementwise with mexed dtypes and limit which functors are actually + // applied to the load and store at compile time. 
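For context, a minimal sketch of the dispatch idea used here, with illustrative names only (not the actual rt_binary_specializations machinery): a compile-time table of dtype triples is unrolled, and the entry that matches the runtime dtypes instantiates a launcher specialized for exactly those types.

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

enum class DType { Float, Half, BFloat16 };

// Hypothetical specialization table: (result, lhs, rhs) dtype triples.
constexpr std::array<std::array<DType, 3>, 2> kSpecializations{{
    {DType::Float, DType::Half, DType::Half},
    {DType::Float, DType::BFloat16, DType::BFloat16},
}};

template <std::size_t I>
void try_launch(const std::array<DType, 3>& runtime_dtypes) {
  if constexpr (I < kSpecializations.size()) {
    if (runtime_dtypes == kSpecializations[I]) {
      // A real implementation would launch a kernel typed on kSpecializations[I] here.
      std::printf("launching specialization %zu\n", I);
      return;
    }
    try_launch<I + 1>(runtime_dtypes);
  }
}

int main() {
  try_launch<0>({DType::Float, DType::Half, DType::Half});
}
```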
+ using func_tuple = typename traits::ArgsTuple; + if constexpr ( + std::is_same_v && traits::arity == 2 && + check_binary_functor_types_for_specialization< + func_tuple, + float, + float, + traits::arity, + /*arg_num=*/0>::check()) { + memory::detail::static_unroll< + type_specialized_broadcast_kernel_launcher, + rt_binary_specializations.size()>::with_args( + numel, + f, + data, + dtypes, + offset_calc + ); + return; + } + } + constexpr int grp_sz = 128; launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { if (unrl) { diff --git a/aten/src/ATen/native/cuda/CompositeRandomAccessor.h b/aten/src/ATen/native/cuda/CompositeRandomAccessor.h index d47a7fa776f1b..eb8587d1f9337 100644 --- a/aten/src/ATen/native/cuda/CompositeRandomAccessor.h +++ b/aten/src/ATen/native/cuda/CompositeRandomAccessor.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index 344906a2a4df2..88c552e9bf120 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -75,30 +75,52 @@ static inline bool can_use_int32_nhwc( return true; } +static inline bool can_use_int32_nchw( + int64_t nbatch, int64_t channels, + int64_t height, int64_t width, + int64_t pooled_height, int64_t pooled_width) { + int64_t hw = height * width; + return can_use_int32_nhwc( + nbatch, channels, height, width, + pooled_height, pooled_width, + channels * hw, // in_stride_n + hw, // in_stride_c + width, // in_stride_h + 1 // in_stride_w + ); +} + // kernels borrowed from Caffe -template -__global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom_data, - const int64_t channels, const int64_t height, - const int64_t width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, scalar_t* top_data, +template +__global__ void max_pool_forward_nchw( + const index_t nthreads, + const scalar_t* bottom_data, + const int64_t channels, + const int64_t height, + const int64_t width, + const int pooled_height, + const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + scalar_t* top_data, int64_t* top_mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); - int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); + CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) { + index_t pw = index % pooled_width; + index_t ph = (index / pooled_width) % pooled_height; + index_t c = (index / pooled_width / pooled_height) % channels; + index_t n = index / pooled_width / pooled_height / channels; + index_t hstart = ph * stride_h - pad_h; + index_t wstart = pw * stride_w - pad_w; + index_t hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + index_t wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); while(hstart < 0) hstart += dilation_h; while(wstart < 0) wstart += dilation_w; scalar_t 
maxval = at::numeric_limits::lower_bound(); // -Infinity - int maxidx = hstart * width + wstart; + index_t maxidx = hstart * width + wstart; const scalar_t* btm_data = bottom_data + (n * channels + c) * height * width; for (int h = hstart; h < hend; h += dilation_h) { for (int w = wstart; w < wend; w += dilation_w) { @@ -251,32 +273,39 @@ __global__ void max_pool_forward_nhwc( static constexpr int BLOCK_THREADS = 256; -template +template #if defined (USE_ROCM) C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 4) #else C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 8) #endif -__global__ void max_pool_backward_nchw(const scalar_t* top_diff, - const int64_t* top_mask, const int num, const int64_t channels, - const int64_t height, const int64_t width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, +__global__ void max_pool_backward_nchw( + const scalar_t* top_diff, + const int64_t* top_mask, + const index_t num, + const index_t channels, + const index_t height, + const index_t width, + const index_t pooled_height, + const index_t pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, scalar_t* bottom_diff) { - CUDA_KERNEL_LOOP(index, height*width) { - int h = index / width; - int w = index - h * width; - int phstart = p_start(h, pad_h, kernel_h, dilation_h, stride_h); - int phend = p_end(h, pad_h, pooled_height, stride_h); - int pwstart = p_start(w, pad_w, kernel_w, dilation_w, stride_w); - int pwend = p_end(w, pad_w, pooled_width, stride_w); - for (int n = blockIdx.y; n < num; n += gridDim.y) { - for (int c = blockIdx.z; c < channels; c+= gridDim.z) { + CUDA_KERNEL_LOOP_TYPE(index, height*width, index_t) { + index_t h = index / width; + index_t w = index - h * width; + index_t phstart = p_start(h, pad_h, kernel_h, dilation_h, stride_h); + index_t phend = p_end(h, pad_h, pooled_height, stride_h); + index_t pwstart = p_start(w, pad_w, kernel_w, dilation_w, stride_w); + index_t pwend = p_end(w, pad_w, pooled_width, stride_w); + for (index_t n = blockIdx.y; n < num; n += gridDim.y) { + for (index_t c = blockIdx.z; c < channels; c += gridDim.z) { accscalar_t gradient = accscalar_t(0); - int offset = (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { + index_t offset = (n * channels + c) * pooled_height * pooled_width; + for (index_t ph = phstart; ph < phend; ++ph) { + for (index_t pw = pwstart; pw < pwend; ++pw) { if (top_mask[ph * pooled_width + pw + offset] == h * width + w) { gradient += static_cast(top_diff[ph * pooled_width + pw + offset]); } @@ -469,8 +498,6 @@ const Tensor& indices) { const int64_t in_stride_h = input.stride(-2); const int64_t in_stride_w = input.stride(-1); - const int count = safe_downcast(output.numel()); - AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { @@ -553,14 +580,42 @@ const Tensor& indices) { break; } case MemoryFormat::Contiguous: { - const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, - BLOCK_THREADS); - max_pool_forward_nchw - <<>>( - count, input_data, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - output_data, indices_data); + const int threads = std::min( + 
at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BLOCK_THREADS); + const int64_t nthreads = output.numel(); + bool use_int32 = can_use_int32_nchw( + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth); + const int maxGridX = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; + const int blocks = static_cast(std::min( + ceil_div(nthreads, static_cast(threads)), + static_cast(maxGridX))); + auto stream = at::cuda::getCurrentCUDAStream(); + if (use_int32) { + max_pool_forward_nchw + <<>>( + static_cast(nthreads), + input_data, + static_cast(nInputPlane), + static_cast(inputHeight), + static_cast(inputWidth), + static_cast(outputHeight), + static_cast(outputWidth), + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + output_data, indices_data); + } else { + max_pool_forward_nchw + <<>>( + nthreads, + input_data, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + output_data, indices_data); + } C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } @@ -633,8 +688,6 @@ const Tensor& gradInput) { gradInput.zero_(); - int64_t count = input.numel(); - AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { @@ -692,25 +745,45 @@ const Tensor& gradInput) { break; } case MemoryFormat::Contiguous: { - int imgcount = inputWidth * inputHeight; - dim3 grid; - const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; - grid.x = blocks; - grid.y = nbatch; - uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - if (maxGridY < grid.y) grid.y = maxGridY; - grid.z = nInputPlane; - uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; - if (maxGridZ < grid.z) grid.z = maxGridZ; - - max_pool_backward_nchw - <<>>( - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - gradInput_data); + const int threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BLOCK_THREADS); + const int imgcount = inputWidth * inputHeight; + const int maxGridX = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; + const int maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + const int maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; + const int blocks_x = std::min(ceil_div(imgcount, threads), maxGridX); + dim3 grid(blocks_x, static_cast(std::min(nbatch, maxGridY)), static_cast(std::min(nInputPlane, maxGridZ))); + bool use_int32 = can_use_int32_nchw( + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth); + auto stream = at::cuda::getCurrentCUDAStream(); + if (use_int32) { + max_pool_backward_nchw + <<>>( + gradOutput_data, + indices_data, + static_cast(nbatch), + static_cast(nInputPlane), + static_cast(inputHeight), + static_cast(inputWidth), + static_cast(outputHeight), + static_cast(outputWidth), + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + } else { + max_pool_backward_nchw + <<>>( + gradOutput_data, + indices_data, + nbatch, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + } C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index ab3747df031eb..9af8abcf3cf82 100644 --- 
a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -78,9 +78,18 @@ __global__ void EmbeddingBag_updateOutputKernel_max( scalar_t weightFeatMax = 0; int64_t bag_size_ = 0; int64_t maxWord = -1; + + // Separate validation loop reduces register pressure in the main loop below. + // No early exit (break) on invalid input as benchmarking shows it degrades performance. + bool has_invalid_index = false; + for (int64_t emb = begin; emb < end; emb++) { + index_t input_idx = input[emb]; + has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows); + } + CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)"); + for (int64_t emb = begin; emb < end; emb++) { bool pad = (input[emb] == padding_idx); - CUDA_KERNEL_ASSERT(input[emb] < numRows); const int64_t weightRow = input[emb] * weight_stride0; scalar_t weightValue = weightFeat[weightRow]; if (bag_size_ == 0 || weightValue > weightFeatMax) { @@ -129,10 +138,19 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean( CUDA_KERNEL_ASSERT(end >= begin); accscalar_t weightFeatSum = 0; int64_t bag_size_ = 0; + + // Separate validation loop reduces register pressure in the main loop below. + // No early exit (break) on invalid input as benchmarking shows it degrades performance. + bool has_invalid_index = false; + for (int64_t emb = begin; emb < end; emb++) { + index_t input_idx = input[emb]; + has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows); + } + CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)"); + for (int64_t emb = begin; emb < end; emb++) { index_t input_idx = input[emb]; bool pad = (input_idx == padding_idx); - CUDA_KERNEL_ASSERT(0 <= input_idx && input_idx < numRows); const int64_t weightRow = input_idx * weight_stride0; scalar_t weightValue = weightFeat[weightRow]; weightValue = pad ? static_cast(0) : weightValue; diff --git a/aten/src/ATen/native/cuda/GroupedBlas.cpp b/aten/src/ATen/native/cuda/GroupedBlas.cpp index f64eb317d0cca..f4b229156d79f 100644 --- a/aten/src/ATen/native/cuda/GroupedBlas.cpp +++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp @@ -22,6 +22,9 @@ #include #include #include +#ifdef USE_ROCM +#include +#endif #include #ifdef USE_FBGEMM_GENAI @@ -75,9 +78,9 @@ _mx8_mx8_bf16_grouped_mm_fbgemm( const Tensor& mat_a, const Tensor& mat_b, const Tensor& scale_a, - const SwizzleType& swizzle_a, + const SwizzleType swizzle_a, const Tensor& scale_b, - const SwizzleType& swizzle_b, + const SwizzleType swizzle_b, const std::optional& offs, Tensor& out) { const bool a_is_2d = mat_a.dim() == 2; @@ -604,6 +607,8 @@ _scaled_grouped_mm_cuda_v2( // scale shape checks _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */); _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */); + // swizzle checks + TORCH_CHECK_VALUE(swizzle_a_enum.size() == 1 && swizzle_b_enum.size() == 1, "Expected single swizzle argument"); return _mx8_mx8_bf16_grouped_mm_fbgemm( mat_a, mat_b, @@ -666,12 +671,26 @@ std::optional out_dtype) { // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used. 
// the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm bool use_fast_path = false; + // On non CK system(w/ ROCm), make sure use_fast_path is false +#if defined(USE_ROCM_CK_GEMM) + if (at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950"})) { + use_fast_path = true; + } +#endif //USE_ROCM_CK_GEMM #endif const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); if (use_fast_path) { // fast path, no d2h sync needed +#ifndef USE_ROCM at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); +#else +#if defined(USE_ROCM_CK_GEMM) + at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out); +#else + TORCH_WARN("ROCm: Group Gemm through CK not selected."); +#endif //USE_ROCM_CK_GEMM +#endif } else { _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); } diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index 927af661396cd..db85f62c8d124 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -74,7 +73,6 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co char* const out_ptr = static_cast(iter.data_ptr(0)); char* const in_ptr = static_cast(iter.data_ptr(1)); - if (is_gather_like && num_indices==1) { const size_t element_size = iter.element_size(0); constexpr size_t alignment = 16; @@ -84,16 +82,9 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co auto ind_dim_size = index_size[0]; auto inp_stride_bytes = index_stride[0]; auto out_stride_bytes = iter.strides(0)[1]; - // avoid grid overflow in the fast kernel - const int64_t vec_chunks = ceil_div(slice_size, alignment); - const int64_t blocks_per_slice_upper = ceil_div(vec_chunks, (int64_t)launch_size_nd); - const int max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - // if it's an eligible grid we use the fast path, otherwise default to slower path - if (blocks_per_slice_upper <= max_grid_y) { - at::native::vectorized_gather_kernel_launch(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind, - slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true); - return; - } + at::native::vectorized_gather_kernel_launch(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind, + slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true); + return; } } diff --git a/aten/src/ATen/native/cuda/IndexKernelUtils.cu b/aten/src/ATen/native/cuda/IndexKernelUtils.cu index 8343c60418952..1e998251dd7be 100644 --- a/aten/src/ATen/native/cuda/IndexKernelUtils.cu +++ b/aten/src/ATen/native/cuda/IndexKernelUtils.cu @@ -13,11 +13,12 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx, if (allow_neg_indices) { ind = (ind < 0) ? 
ind + ind_dim_size : ind; } - CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind); - int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits - if (off >= slice_size) return; - auto vec = at::native::memory::ld_vec(inp + ind * inp_stride + off); - at::native::memory::st_vec(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits + CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"); + // off is guaranteed to be within int32 limits + for (int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; off < slice_size; off += blockDim.x * gridDim.y * Alignment) { + auto vec = at::native::memory::ld_vec(inp + ind * inp_stride + off); + at::native::memory::st_vec(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits + } } @@ -30,7 +31,9 @@ void vectorized_gather_kernel_launch(char * out, char * inp, index_t * idx, int auto num_threads = at::round_up( at::ceil_div(slice_size_in_bytes, Alignment), static_cast(C10_WARP_SIZE)); - dim3 grid = {static_cast(num_ind), static_cast(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), 1}; + uint32_t grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + grid_y = std::min(static_cast(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), grid_y); + dim3 grid = {static_cast(num_ind), grid_y, 1}; auto block = std::min(max_num_threads, num_threads); vectorized_gather_kernel<<>>(out, inp, idx, num_ind, slice_size_in_bytes, ind_dim_size, inp_stride_bytes, out_stride_bytes, allow_neg_indices); diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh index 5c8b98105bb26..fd406829707a1 100644 --- a/aten/src/ATen/native/cuda/KernelUtils.cuh +++ b/aten/src/ATen/native/cuda/KernelUtils.cuh @@ -5,69 +5,11 @@ #include #endif -// ROCm 6.3 is planned to have these functions, but until then here they are. 
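As an aside, a minimal CUDA sketch (not part of the patch; plain byte copies stand in for the 16-byte vector loads) of the grid-stride pattern the vectorized gather kernel above now uses, so that a grid.y capped at maxGridSize[1] still covers arbitrarily large slices:

```cpp
#include <cstdint>

// Each (blockIdx.y, threadIdx.x) pair walks the slice in strides of the whole
// y-grid, so correctness no longer depends on gridDim.y covering the slice.
__global__ void strided_slice_copy(char* out, const char* in, int64_t slice_bytes) {
  constexpr int kAlign = 16;  // stands in for the Alignment template parameter
  for (int64_t off = static_cast<int64_t>(blockDim.x * blockIdx.y + threadIdx.x) * kAlign;
       off < slice_bytes;
       off += static_cast<int64_t>(blockDim.x) * gridDim.y * kAlign) {
    for (int i = 0; i < kAlign && off + i < slice_bytes; ++i) {
      out[off + i] = in[off + i];  // real kernel does one vectorized ld/st here
    }
  }
}
```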
#if defined(USE_ROCM) #include #include #include - -__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) { -#if (defined(__gfx942__)) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16) - typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2; - static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw)); - union { - __hip_bfloat162_raw bf162_raw; - vec_short2 vs2; - } u{static_cast<__hip_bfloat162_raw>(value)}; - u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2); - return static_cast<__hip_bfloat162>(u.bf162_raw); -#else - static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw)); - union u_hold { - __hip_bfloat162_raw h2r; - unsigned int u32; - }; - u_hold old_val, new_val; - old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - do { - new_val.h2r = __hadd2(old_val.h2r, value); - } while (!__hip_atomic_compare_exchange_strong( - (unsigned int*)address, &old_val.u32, new_val.u32, - __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)); - return old_val.h2r; -#endif -} - -__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) { -#if (defined(__gfx942__)) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16) - // The api expects an ext_vector_type of half - typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162; - static_assert(sizeof(vec_fp162) == sizeof(__half2_raw)); - union { - __half2_raw h2r; - vec_fp162 fp16; - } u {static_cast<__half2_raw>(value)}; - u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16); - return static_cast<__half2>(u.h2r); -#else - static_assert(sizeof(__half2_raw) == sizeof(unsigned int)); - union u_hold { - __half2_raw h2r; - unsigned int u32; - }; - u_hold old_val, new_val; - old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - do { - new_val.h2r = __hadd2(old_val.h2r, value); - } while (!__hip_atomic_compare_exchange_strong( - (unsigned int*)address, &old_val.u32, new_val.u32, - __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)); - return old_val.h2r; -#endif -} -#define ATOMICADD preview_unsafeAtomicAdd +#define ATOMICADD unsafeAtomicAdd #define NATIVE_ZERO_BF16 __float2bfloat16(0.0f) #else #define ATOMICADD atomicAdd diff --git a/aten/src/ATen/native/cuda/LogAddExpKernel.cu b/aten/src/ATen/native/cuda/LogAddExpKernel.cu index 7b8b5b5bb2032..910d3c1cddc93 100644 --- a/aten/src/ATen/native/cuda/LogAddExpKernel.cu +++ b/aten/src/ATen/native/cuda/LogAddExpKernel.cu @@ -2,18 +2,250 @@ #include #include #include +#include +#include +#include #include #include #include #include +#include + +#include +#include // NOTE: CUDA on Windows requires that the enclosing function // of a __device__ lambda not have internal linkage. namespace at::native { +// custom min and max to be used in logaddexp for complex arguments +template +__host__ __device__ c10::complex _logaddexp_minmax(const c10::complex& x, const c10::complex& y) { + scalar_t xr = std::real(x); + scalar_t yr = std::real(y); + if (::isnan(yr) || (::isnan(std::imag(y)))) { + return y; + } else if (::isnan(xr) || (::isnan(std::imag(x)))) { + return x; + } else if (min) { // min + return (xr < yr) ? x : y; + } else { // max + return (xr >= yr) ? 
x : y; + } +} + +template +__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) { + // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp + // Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM + const auto isnan_x = at::_isnan(x); + const auto isnan_y = at::_isnan(y); + scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y)); + scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y)); + if (min != max || ::isfinite(min)) { + // nan will be propagated here + return ::log1p(std::exp(min - max)) + max; + } else { + // special case to correctly handle infinite cases + return x; + } +} + +template +__host__ __device__ c10::complex _fast_build_exp(const c10::complex& x) { + // complex exponential function, but implemented manually to get fast compilation time + // this function only handles the case where x is finite (not inf nor nan) + const auto xreal = std::real(x); + const auto ximag = std::imag(x); + const auto exp_x_abs = std::exp(xreal); + auto exp_x_real = exp_x_abs * std::cos(ximag); + auto exp_x_imag = exp_x_abs * std::sin(ximag); + return {exp_x_real, exp_x_imag}; +} + +template +__host__ __device__ c10::complex _fast_build_exp_inf(const c10::complex& x) { + // complex exponential function, but implemented manually to get fast compilation time + // this function only handles the case where the real part of x is infinite + const auto ximag = std::imag(x); + constexpr auto exp_x_abs = std::numeric_limits::infinity(); + if (!::isfinite(ximag)) { // add this to make consistent with std::exp(x+yi) + return {exp_x_abs, std::numeric_limits::quiet_NaN()}; + } + const auto sin = std::sin(ximag); + const auto cos = std::cos(ximag); + // special case if the angle is exactly a multiple of pi/2 + auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos; + auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin; + return {exp_x_real, exp_x_imag}; +} + +template +__host__ __device__ c10::complex _log_add_exp_helper(const c10::complex& x, const c10::complex& y) { + c10::complex min = _logaddexp_minmax(x, y); + c10::complex max = _logaddexp_minmax(x, y); + scalar_t min_real = std::real(min); + scalar_t max_real = std::real(max); + + if (::isnan(min_real) || ::isnan(std::imag(min))) { + // handling the "infectious" NaNs + return {std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}; + } + else if ((!::isfinite(min_real)) && (min_real == max_real)) { + if (min_real < 0) { + // handle the -inf case, the imaginary part here does not really matter as the exp(value) + // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined. + // It does not matter if we're taking the exp of this value + return min; + } else { + // handle the +inf case, we don't need the special precision for log1p for small values + // and to avoid producing nan in case of real(max) == real(min) == +inf + const auto exp_min = _fast_build_exp_inf(min); + const auto exp_max = _fast_build_exp_inf(max); + return ::log1p(exp_min + exp_max - 1); // log1p(x - 1) builds faster than log + } + } else { + const auto minmax = min - max; + c10::complex exp_minmax; + if (!::isfinite(minmax.real())) { + exp_minmax = minmax.real() < 0 ? 
c10::complex{0.0, 0.0} : _fast_build_exp_inf(minmax); + } else { + exp_minmax = _fast_build_exp(minmax); + } + return ::log1p(exp_minmax) + max; + } +} + +// Complex logaddexp jiterator string +const auto logaddexp_complex_string = jiterator_stringify( + template + std::complex log1p(const std::complex& z) + { + using complex_t = std::complex; + T x = z.real(); + T y = z.imag(); + T zabs = abs(z); + T theta = atan2(y, x + T(1)); + if (zabs < 0.5) { + T r = x * (T(2) + x) + y * y; + if (r == 0) { // handle underflow + return complex_t(x, theta); + } + return complex_t(T(0.5) * std::log1p(r), theta); + } else { + T z0 = std::hypot(x + 1, y); + return complex_t(log(z0), theta); + } + } + + // separated _logaddexp_minmax into 2 different functions for jiterator_string + template + std::complex logaddexp_min(const std::complex& x, const std::complex& y) { + T xr = x.real(); + T yr = y.real(); + if (isnan(yr) || isnan(y.imag())) { + return y; + } else if (isnan(xr) || isnan(x.imag())) { + return x; + } else { + return (xr < yr) ? x : y; + } + } + + template + std::complex logaddexp_max(const std::complex& x, const std::complex& y) { + T xr = x.real(); + T yr = y.real(); + if (isnan(yr) || isnan(y.imag())) { + return y; + } else if (isnan(xr) || isnan(x.imag())) { + return x; + } else { + return (xr >= yr) ? x : y; + } + } + + template + std::complex fast_build_exp(const std::complex& x) { + const auto xreal = x.real(); + const auto ximag = x.imag(); + const auto exp_x_abs = exp(xreal); + auto exp_x_real = exp_x_abs * cos(ximag); + auto exp_x_imag = exp_x_abs * sin(ximag); + return std::complex(exp_x_real, exp_x_imag); + } + + template + std::complex fast_build_exp_inf(const std::complex& x) { + using complex_t = std::complex; + const auto ximag = x.imag(); + const T exp_x_abs = INFINITY; + if (!isfinite(ximag)) { + return complex_t(exp_x_abs, NAN); + } + const auto sin_val = sin(ximag); + const auto cos_val = cos(ximag); + auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val; + auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val; + return complex_t(exp_x_real, exp_x_imag); + } + + template + complex_t logaddexp_complex(complex_t x, complex_t y) { + using T = typename complex_t::value_type; + complex_t min_val = logaddexp_min(x, y); + complex_t max_val = logaddexp_max(x, y); + T min_real = min_val.real(); + T max_real = max_val.real(); + + if (isnan(min_real) || isnan(min_val.imag())) { + return complex_t(NAN, NAN); + } + else if ((!isfinite(min_real)) && (min_real == max_real)) { + if (min_real < T(0)) { + return min_val; + } else { + const auto exp_min = fast_build_exp_inf(min_val); + const auto exp_max = fast_build_exp_inf(max_val); + return log1p(exp_min + exp_max - complex_t(1, 0)); + } + } else { + const auto minmax = min_val - max_val; + complex_t exp_minmax; + if (!isfinite(minmax.real())) { + exp_minmax = (minmax.real() < T(0)) ? 
complex_t(0, 0) : fast_build_exp_inf(minmax); + } else { + exp_minmax = fast_build_exp(minmax); + } + return log1p(exp_minmax) + max_val; + } + } +); + +constexpr char logaddexp_complex_name[] = "logaddexp_complex"; void logaddexp_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES_AND2( + if (at::isComplexType(iter.dtype())) { +#if AT_USE_JITERATOR() + AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/logaddexp_complex_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/2>(iter, logaddexp_complex_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() { + using opmath_t = at::opmath_type; + gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t { + const auto a = static_cast(a_); + const auto b = static_cast(b_); + return static_cast(_log_add_exp_helper(a, b)); + }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, ScalarType::Half, iter.dtype(), "logaddexp_cuda", [&]() { @@ -29,6 +261,7 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) { } }); }); + } } void logaddexp2_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/Reduce.cu b/aten/src/ATen/native/cuda/Reduce.cu index 36a1313488245..b32c55a10df6b 100644 --- a/aten/src/ATen/native/cuda/Reduce.cu +++ b/aten/src/ATen/native/cuda/Reduce.cu @@ -11,7 +11,7 @@ static inline std::ostream& operator<<(std::ostream& out, dim3 dim) { if (dim.y == 1 && dim.z == 1) { out << dim.x; } else { - out << "[" << dim.x << "," << dim.y << "," << dim.z << "]"; + out << '[' << dim.x << ',' << dim.y << ',' << dim.z << ']'; } return out; } @@ -27,7 +27,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) { out << "input_mult=["; for (int i = 0; i < 3; i++) { if (i != 0) { - out << ","; + out << ','; } out << config.input_mult[i]; } @@ -35,7 +35,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) { out << "output_mult=["; for (int i = 0; i < 2; i++) { if (i != 0) { - out << ","; + out << ','; } out << config.output_mult[i]; } @@ -49,7 +49,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) { out << "block=" << config.block() << ", "; out << "grid=" << config.grid() << ", "; out << "global_memory_size=" << config.global_memory_size(); - out << ")"; + out << ')'; return out; } diff --git a/aten/src/ATen/native/cuda/ScaledBlas.cpp b/aten/src/ATen/native/cuda/ScaledBlas.cpp index 0d2963874abbd..4ff61f71f2b61 100644 --- a/aten/src/ATen/native/cuda/ScaledBlas.cpp +++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp @@ -59,6 +59,24 @@ // forward declare class cublasCommonArgs; +#ifndef _WIN32 +namespace fbgemm_gpu { + +// NOTE(slayton58): FBGemm_GPU kernels come from within the FBGemm repo. +// To update supported ops means a submodule bump, which is.. painful. Instead, we +// can simply forward-declare the methods we want to use.. Works at least as a short-term +// thing, but should still be fixed somewhere/somehow. 
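A minimal sketch of the forward-declaration approach this NOTE describes; the names below are illustrative, not FBGemm's actual API. The point is only that a translation unit can declare an external function with a matching signature and call it without including the library's headers, as long as the real definition is linked in:

```cpp
#include <cstdint>

namespace external_lib {
// Declaration only; the definition is expected to come from the prebuilt library
// this TU links against (hypothetical example, no particular library implied).
int64_t fused_gemm(int64_t m, int64_t n, int64_t k);
}  // namespace external_lib

int64_t call_without_header(int64_t m, int64_t n, int64_t k) {
  // Compiles against the declaration; the linker resolves the symbol later.
  return external_lib::fused_gemm(m, n, k);
}
```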
+at::Tensor f4f4bf16( + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + std::optional, + bool use_mx); + +} // namespace fbgemm_gpu +#endif + using at::blas::ScalingType; using at::blas::SwizzleType; @@ -722,7 +740,12 @@ _scaled_rowwise_rowwise( TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel()) TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel()) - TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1)); + // if we have a scale of shape [256, 1] (say), then stride can be [1, 0] - handle this case + TORCH_CHECK_VALUE( + scale_a.stride(1) == 1 || + scale_a.size(1) == 1, + "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1) + ); TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1)); auto scaling_choice_a = ScalingType::RowWise; @@ -1078,6 +1101,19 @@ _scaled_mxfp8_mxfp8( return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); } +void +_check_mxfp4_support() { +#ifndef USE_ROCM + auto dprops = at::cuda::getCurrentDeviceProperties(); + // Only on B200 GPUs + TORCH_CHECK_NOT_IMPLEMENTED( + // B200 = 10.0, B300 = 10.3 + dprops->major == 10, + "MXFP4 scaling only supported in CUDA for B200/B300" + ); +#endif +} + Tensor& _scaled_mxfp4_mxfp4( @@ -1087,26 +1123,48 @@ _scaled_mxfp4_mxfp4( const std::optional& bias, const c10::ScalarType out_dtype, Tensor& out) { -#ifndef USE_ROCM - TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM only"); -#endif +#if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI)) + TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only"); +#else + _check_mxfp4_support(); // Restrictions: // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32 TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ", mat_a.scalar_type(), mat_b.scalar_type()); - auto scale_a_elems = ceil_div(2 * mat_a.size(0), 32) * mat_a.size(1); - auto scale_b_elems = ceil_div(2 * mat_b.size(1), 32) * mat_b.size(0); + // Packed FP4 format means actual-K = 2 * reported-K -- adjust + auto K_multiplier = 2; +#ifdef USE_ROCM + // AMD + auto scale_a_elems = ceil_div(K_multiplier * mat_a.size(0), 32) * mat_a.size(1); + auto scale_b_elems = ceil_div(K_multiplier * mat_b.size(1), 32) * mat_b.size(0); +#else + // NVIDIA + auto scale_a_elems = round_up(mat_a.size(0), 128) * round_up(ceil_div(K_multiplier * mat_a.size(1), 32), 4); + auto scale_b_elems = round_up(mat_b.size(1), 128) * round_up(ceil_div(K_multiplier * mat_b.size(0), 32), 4); +#endif TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(), "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel()); TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(), "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel()); +#ifdef USE_ROCM + // AMD + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::NO_SWIZZLE, "scale_a must not be swizzled (NO_SWIZZLE format)"); + TORCH_CHECK_VALUE(swizzle_b == SwizzleType::NO_SWIZZLE, "scale_b must not be swizzled (NO_SWIZZLE format)"); +#else + // NVIDIA + 
TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format"); + TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format"); +#endif + TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(), "For Blockwise scaling both scales should be contiguous"); TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype); +#ifdef USE_ROCM + // AMD auto scaling_choice_a = ScalingType::BlockWise1x32; auto scaling_choice_b = ScalingType::BlockWise1x32; @@ -1121,11 +1179,30 @@ _scaled_mxfp4_mxfp4( TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 || out.scalar_type() == ScalarType::Half, "Block-wise scaling only supports BFloat16 or Half output types"); -#else - TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); #endif return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); +#else + // NVIDIA + // NOTE(slayton58): fbgemm_gpu::f4f4bf16 does *not* allow passing an output tensor, + // but we have one we need to use. Two clear options are to copy into + // our output (slow), or use a move-assignment-operator (faster). + // However, the compiler can complain about the explicit move preventing + // copy elision because the return from f4f4bf16 is a temporary object. + // So we don't explicitly move, and trust the compiler here... + // In the longer term this should be fixed on the FBGemm side. + out = fbgemm_gpu::f4f4bf16( + mat_a, + mat_b.transpose(-2, -1), + scale_a, + scale_b, + std::nullopt, /* global_scale */ + true /* use_mx */ + ); + + return out; +#endif +#endif } Tensor& @@ -1250,17 +1327,20 @@ _scaled_mm_cuda_v2_out( mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")"); } + // Handle fp4 packed-K dimension + int K_multiplier = (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2) ? 2 : 1; + TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1], " but got ", bias->numel()); TORCH_CHECK_VALUE( - mat_a.sizes()[1] % 16 == 0, + K_multiplier * mat_a.sizes()[1] % 16 == 0, "Expected trailing dimension of mat1 to be divisible by 16 ", "but got mat1 shape: (", mat_a.sizes()[0], "x", - mat_a.sizes()[1], + K_multiplier * mat_a.sizes()[1], ")."); - TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x", + TORCH_CHECK_VALUE(K_multiplier * mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x", mat_b.sizes()[1], ") must be divisible by 16"); // TODO(slayton): Existing checks, not sure if they should really be here. 
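For reference, the packed-FP4 bookkeeping above can be reproduced with a minimal standalone host-side sketch. This is illustrative only and not part of the patch: the ceil_div/round_up helpers are re-implemented locally to mirror the ones used in ScaledBlas.cpp, and the tensor sizes (128x64 and 64x256) are arbitrary example values.

#include <cstdint>
#include <iostream>

namespace {
int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }
int64_t round_up(int64_t a, int64_t b) { return ceil_div(a, b) * b; }
}  // namespace

int main() {
  // Float4_e2m1fn_x2 packs two FP4 values per element, so the effective
  // contraction dimension is twice the reported one.
  const int64_t K_multiplier = 2;

  // Illustrative sizes only: mat_a is [a0, a1], mat_b is [b0, b1].
  const int64_t a0 = 128, a1 = 64, b0 = 64, b1 = 256;

  // ROCm branch: plain 1x32 block scales, no swizzle.
  const int64_t scale_a_rocm = ceil_div(K_multiplier * a0, 32) * a1;
  const int64_t scale_b_rocm = ceil_div(K_multiplier * b1, 32) * b0;

  // NVIDIA branch: SWIZZLE_32_4_4 layout, rows padded to 128 and scale
  // columns padded to a multiple of 4.
  const int64_t scale_a_nv = round_up(a0, 128) * round_up(ceil_div(K_multiplier * a1, 32), 4);
  const int64_t scale_b_nv = round_up(b1, 128) * round_up(ceil_div(K_multiplier * b0, 32), 4);

  // The v2 entry point applies the same multiplier before its divisibility-by-16 checks.
  const bool k_divisible_by_16 = (K_multiplier * a1) % 16 == 0;

  std::cout << scale_a_rocm << ' ' << scale_b_rocm << ' '
            << scale_a_nv << ' ' << scale_b_nv << ' '
            << k_divisible_by_16 << '\n';
  return 0;
}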
diff --git a/aten/src/ATen/native/cuda/ScaledGroupMM.cu b/aten/src/ATen/native/cuda/ScaledGroupMM.cu index 9a06c5907febc..71c9c8dac766d 100644 --- a/aten/src/ATen/native/cuda/ScaledGroupMM.cu +++ b/aten/src/ATen/native/cuda/ScaledGroupMM.cu @@ -364,9 +364,9 @@ void f8f8bf16_grouped_gemm_impl_sm90( // reinterpret_cast( // stride_output_h + group_count); - // std::cout << "PTRS " << mat_a.data_ptr() << " " << mat_b.data_ptr() << " + // std::cout << "PTRS " << mat_a.data_ptr() << ' ' << mat_b.data_ptr() << " // " - // << out.data_ptr() << " " << scale_a.data_ptr() << " " + // << out.data_ptr() << ' ' << scale_a.data_ptr() << ' ' // << scale_b.data_ptr() << "\n"; // for (int i = 0; i < group_count; i++) { // std::cout << "A " << (void*)inputA_ptrs_h[i] << "\n"; diff --git a/aten/src/ATen/native/cuda/ScanUtils.cuh b/aten/src/ATen/native/cuda/ScanUtils.cuh index c4d86acb43e7b..693ad0cb6ce10 100644 --- a/aten/src/ATen/native/cuda/ScanUtils.cuh +++ b/aten/src/ATen/native/cuda/ScanUtils.cuh @@ -267,15 +267,15 @@ void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, con * outer dimensions, which contains several "inner rows"). * Each thread processes a single inner row at a time. */ -template +template __global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, const scalar_t *src_, const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, const scalar_t init, BinaryOp binary_op) { for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { - const scalar_t *src = src_ + orow * row_size * num_irows + irow; - scalar_t *tgt = tgt_ + orow * row_size * num_irows + irow; + const scalar_t *src = src_ + static_cast(orow) * row_size * num_irows + irow; + scalar_t *tgt = tgt_ + (index_t) orow * row_size * num_irows + irow; scalar_t acc = init; for (uint32_t col = 0; col < row_size; ++col) { @@ -409,10 +409,15 @@ __host__ void scan_outer_dim(const TensorBase& self, const TensorBase& result, check_fits_in_unsigned(num_irows, "num_irows"); check_fits_in_unsigned(num_orows, "num_orows"); check_fits_in_unsigned(row_size, "row_size"); - - tensor_kernel_scan_outer_dim<<>>( + if (static_cast(num_irows) * num_orows * row_size <= UINT_MAX) { + tensor_kernel_scan_outer_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + } else { + tensor_kernel_scan_outer_dim<<>>( result.mutable_data_ptr(), self.const_data_ptr(), num_orows, num_irows, row_size, init, binary_op); + } C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index 09c8e74d4b2cf..e65fa4ceb38e9 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -1057,14 +1057,14 @@ std::string generate_code( // TODO these arrays are potentially of the different types, use function // traits to determine the types declare_load_arrays << f_inputs_type << " arg" << std::to_string(i) - << "[" << std::to_string(thread_work_size) << "];\n"; + << '[' << std::to_string(thread_work_size) << "];\n"; } env.s("declare_load_arrays", declare_load_arrays.str()); std::stringstream declare_store_arrays; for (int i = 0; i < nOutputs; i++) { declare_store_arrays << result_type << " out" << std::to_string(i) - << "[" << std::to_string(thread_work_size) << "];\n"; + << '[' << std::to_string(thread_work_size) << "];\n"; } 
env.s("declare_store_arrays", declare_store_arrays.str()); @@ -1217,7 +1217,7 @@ std::string generate_code( for (const auto i : c10::irange(nInputs)){ auto i_string = std::to_string(i); vector_inputs << "auto * input" << i_string << - " = reinterpret_cast(data[" << i_string << "+" << nOutputs << "])" << + " = reinterpret_cast(data[" << i_string << '+' << nOutputs << "])" << " + block_work_size * idx;\n"; } env.s("vector_inputs", vector_inputs.str()); @@ -1543,17 +1543,17 @@ NvrtcFunction jit_pwise_function( // Constructs file path by appending constructed cubin name to cache path std::stringstream ss; - ss << *cache_dir << "/"; + ss << *cache_dir << '/'; ss << kernel_name; #ifdef USE_ROCM ss << "_arch" << prop->gcnArchName; #else - ss << "_arch" << cuda_major << "." << cuda_minor; + ss << "_arch" << cuda_major << '.' << cuda_minor; #endif - ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor; + ss << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor; ss << (compile_to_sass ? "_sass" : "_ptx"); - ss << "_" << code.length(); - ss << "_" << hash_code; + ss << '_' << code.length(); + ss << '_' << hash_code; file_path = ss.str(); std::ifstream readin{file_path, std::ios::in | std::ifstream::binary}; diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp index 325b082f314d9..1584d5e9acd38 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.cpp +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -82,15 +82,15 @@ namespace native { std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) { out << "ConvolutionParams \n" - << " memory_format = " << params.memory_format << "\n" - << " data_type = " << cudnnTypeToString(params.dataType) << "\n" - << " padding = " << ArrayRef{params.padding} << "\n" - << " stride = " << ArrayRef{params.stride} << "\n" - << " dilation = " << ArrayRef{params.dilation} << "\n" - << " groups = " << params.groups << "\n" + << " memory_format = " << params.memory_format << '\n' + << " data_type = " << cudnnTypeToString(params.dataType) << '\n' + << " padding = " << ArrayRef{params.padding} << '\n' + << " stride = " << ArrayRef{params.stride} << '\n' + << " dilation = " << ArrayRef{params.dilation} << '\n' + << " groups = " << params.groups << '\n' << " deterministic = " << (params.deterministic ? "true" : "false") - << "\n" - << " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n"; + << '\n' + << " allow_tf32 = " << (params.allow_tf32 ? 
"true" : "false") << '\n'; return out; } @@ -173,16 +173,16 @@ std::string repro_from_args(const ConvolutionParams& params) { at::globalContext().float32Precision( at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) - << "\n"; + << '\n'; ss << "torch.backends.cudnn.benchmark = " - << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; + << pybool(at::globalContext().benchmarkCuDNN()) << '\n'; ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic) - << "\n"; + << '\n'; ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32) - << "\n"; + << '\n'; ss << "data = torch.randn(" << ArrayRef(params.input_size, dim) << ", dtype=" << full_dtype << ", "; - ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; + ss << "device='cuda', requires_grad=True)" << to_channels_last << '\n'; ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", " << out_channels << ", "; ss << "kernel_size=" << ArrayRef(¶ms.weight_size[2], dim - 2) @@ -192,7 +192,7 @@ std::string repro_from_args(const ConvolutionParams& params) { ss << "dilation=" << ArrayRef(params.dilation, dim - 2) << ", "; ss << "groups=" << params.groups << ")\n"; ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last - << "\n"; + << '\n'; ss << "out = net(data)\n"; ss << "out.backward(torch.randn_like(out))\n"; ss << "torch.cuda.synchronize()\n\n"; diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index bc064e3ad3167..d5102910c6471 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -93,11 +93,10 @@ std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) { << "input: " << args.idesc // already has a trailing newline << "output: " << args.odesc // already has a trailing newline << "weight: " << args.wdesc // already has a trailing newline - << "Pointer addresses: " - << "\n" - << " input: " << args.input.const_data_ptr() << "\n" - << " output: " << args.output.const_data_ptr() << "\n" - << " weight: " << args.weight.const_data_ptr() << "\n"; + << "Pointer addresses: " << '\n' + << " input: " << args.input.const_data_ptr() << '\n' + << " output: " << args.output.const_data_ptr() << '\n' + << " weight: " << args.weight.const_data_ptr() << '\n'; return out; } diff --git a/aten/src/ATen/native/hip/ck_group_gemm.h b/aten/src/ATen/native/hip/ck_group_gemm.h new file mode 100644 index 0000000000000..c50307c9f8ea3 --- /dev/null +++ b/aten/src/ATen/native/hip/ck_group_gemm.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace hip { +namespace detail { +void group_gemm_ck( + const at::Tensor& mat_a, + const at::Tensor& mat_b, + const std::optional& offs, + const std::optional& bias, + at::Tensor& out); + +} // namespace detail +} // namespace hip +} // namespace at diff --git a/aten/src/ATen/native/hip/ck_group_gemm.hip b/aten/src/ATen/native/hip/ck_group_gemm.hip new file mode 100644 index 0000000000000..c436ad660c1c7 --- /dev/null +++ b/aten/src/ATen/native/hip/ck_group_gemm.hip @@ -0,0 +1,462 @@ +#undef __HIP_NO_HALF_CONVERSIONS__ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +template +using S = ck::Sequence; + +namespace at { +namespace hip { +namespace detail { + +namespace CkTypes { + using BF16 = ck::bhalf_t; + using F16 = ck::half_t; + using F32 = float; + using PassThrough = 
ck::tensor_operation::element_wise::PassThrough; +} + +template +using GroupedGemmKernel = ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage< + ALayout, BLayout, ck::Tuple<>, ck::tensor_layout::gemm::RowMajor, + DataType, DataType, CkTypes::F32, DataType, ck::Tuple<>, DataType, + CkTypes::PassThrough, CkTypes::PassThrough, CkTypes::PassThrough, + ck::tensor_operation::device::GemmSpecialization::MNKPadding, + 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, + S<1,4,64,1>, S<0,2,1,3>, S<0,2,1,3>, + 3, 8, 8, 1, + S<1,4,64,1>, S<0,2,1,3>, S<0,2,1,3>, + 3, 8, 8, 1, + 1, 1, + S<1,32,1,8>, 4 +>; + +template +void launch_grouped_bgemm_ck_impl_dispatch( + const at::Tensor& mat_a, + const at::Tensor& mat_b, + const std::optional& offs, + at::Tensor& out) +{ + using DeviceOp = GroupedGemmKernel; + using PassThrough = CkTypes::PassThrough; + + std::vector gemm_descs; + std::vector p_a_ptrs, p_b_ptrs; + std::vector p_e_ptrs; + // Note: d_ptrs will be resized after we populate the other vectors + + const int mat_a_dim = mat_a.dim(); + const int mat_b_dim = mat_b.dim(); + + const char* a_ptr_base = reinterpret_cast(mat_a.data_ptr()); + const char* b_ptr_base = reinterpret_cast(mat_b.data_ptr()); + char* out_ptr_base = reinterpret_cast(out.data_ptr()); + const size_t a_element_size = mat_a.element_size(); + const size_t b_element_size = mat_b.element_size(); + const size_t out_element_size = out.element_size(); + + // for each group, calculate m,n,k,lda,ldb,ldc and A,B,out pointer base addresses. + if (mat_a_dim == 2 && mat_b_dim == 2) { + // 2D*2D case requires offset tensor + auto offs_accessor = offs->accessor(); + int num_groups = offs_accessor.size(0); + const int M = mat_a.size(0); // number of rows in A + const int N = mat_b.size(1); // number of columns in B + const int K = mat_a.size(1); // columns in A == rows in B + // for 2d*2d input, output is 3d. + // for each group, A columns (K) are sliced. M and N dimensions are not sliced. + for (int i = 0; i < num_groups; ++i) { + int start_k = (i == 0) ? 0 : offs_accessor[i-1]; + int end_k = offs_accessor[i]; + int k = end_k - start_k; + + //K dimension are sliced, hence select stride(1) always. 
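+ // e.g. with offs = [32, 96] (illustrative values), group 0 covers K columns [0, 32) + // and group 1 covers [32, 96); the full M and N extents are reused by every group.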
+ //K dimension is always dimension 1, regardless of memory layout (row/column major) + const void* group_a_ptr = a_ptr_base + start_k * mat_a.stride(1) * a_element_size; + const void* group_b_ptr; + int ldb; + + if (std::is_same::value) { + // Row-major B [K,N]: K values are horizontally adjacent, use stride(1) for K offset + group_b_ptr = b_ptr_base + start_k * mat_b.stride(1) * b_element_size; + // Leading dimension = distance between rows = stride(0) + ldb = mat_b.stride(0); + } else { + // Column-major B [K,N]: K values are vertically adjacent, use stride(0) for K offset + group_b_ptr = b_ptr_base + start_k * mat_b.stride(0) * b_element_size; + // Leading dimension = distance between columns = stride(1) + ldb = mat_b.stride(1); + } + + // Calculate output pointer for group i in 3D tensor [num_groups, M, N] + // stride(0) = M*N elements between groups, so skip i*stride(0) elements to reach group i + void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size; + int lda, ldc; + if (std::is_same::value) { + // Row-major A [M,K]: leading dimension = distance between rows = stride(0) + lda = mat_a.stride(0); + } else { + // Column-major A [M,K]: leading dimension = distance between columns = stride(1) + lda = mat_a.stride(1); + } + // Output is always row-major in 3D tensor [num_groups, M, N] + // Leading dimension for each group's [M,N] slice = stride(1) = N + ldc = out.stride(1); + size_t output_group_bytes = M * N * out_element_size; + void* group_e_ptr_end = (char*)group_e_ptr + output_group_bytes; + + gemm_descs.push_back({ + static_cast(M), + static_cast(N), + static_cast(k), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else if (mat_a_dim == 2 && mat_b_dim == 3) { + // 2D*3D case requires offset tensor + auto offs_accessor = offs->accessor(); + int num_groups = offs_accessor.size(0); + + // 2d*3d input, output is 2d. + // A: [m * n_groups, k], B: [n_groups, n, k] or [n_groups, k, n], Output: [m * n_groups, n] + // Offset divides M dimension (rows of A), each group gets different rows of A and different batch of B + const int K = mat_a.size(1); // columns in A + // For 2D-3D case: The output determines N (result width) + const int N = out.size(1); // N is the width of the output tensor + + for (int i = 0; i < num_groups; ++i) { + int start_m = (i == 0) ? 
0 : offs_accessor[i - 1]; + int end_m = offs_accessor[i]; + int m = end_m - start_m; + + // Skip zero-sized groups but continue processing subsequent groups + if (m <= 0) { + continue; + } + + // Select A rows for group i: skip start_m rows + const void* group_a_ptr; + int lda; + if (std::is_same::value) { + // Row-major A [total_m, K]: skip start_m rows, each row is stride(0) elements apart + group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size; + lda = mat_a.stride(0); // distance between rows + } else { + // Column-major A [total_m, K]: skip start_m elements in the first dimension (stride(0) is between rows) + group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size; + + // Detect stride pattern for A tensor to determine appropriate lda calculation + bool a_is_strided_tensor = (mat_a.stride(0) > mat_a.size(0)); + + if (a_is_strided_tensor) { + // For strided A tensors: stride(0) gives the actual leading dimension + lda = mat_a.stride(0); + } else { + // For non-strided A tensors: use the M dimension (total rows) + lda = mat_a.size(0); // Total M dimension for column-major layout + } + } + + // Select B batch for group i: B[i, :, :] + const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size; + int ldb; + + if (std::is_same::value) { + // Row-major GEMM: expecting B as [K, N] but we have [N, K], so transpose needed + ldb = mat_b.stride(2); // Leading dimension for accessing as [K, N] + } else { + // Detect stride pattern to determine appropriate ldb calculation + bool is_strided_tensor = (mat_b.stride(2) > mat_b.size(2)); + + if (is_strided_tensor) { + // For strided tensors: stride(2) gives the actual leading dimension + ldb = mat_b.stride(2); + } else { + // For non-strided tensors: use the N dimension + ldb = mat_b.size(1); + } + } + + // Output for this group: rows [start_m:end_m, :] in 2D output [total_m, N] + void* group_e_ptr = out_ptr_base + start_m * out.stride(0) * out_element_size; + int ldc = out.stride(0); // distance between rows in output (should be N for 2D case) + + gemm_descs.push_back({ + static_cast(m), + static_cast(N), + static_cast(K), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else if (mat_a_dim == 3 && mat_b_dim == 3) { + // 3d*3d input, output is 3d - batched matrix multiplication + // A: [batch, m, k], B: [batch, k, n] or [batch, n, k] (depending on transpose), Output: [batch, m, n] + // Each batch is processed as a separate GEMM operation + const int batch_size = mat_a.size(0); + const int M = mat_a.size(1); // rows in each A matrix + const int K = mat_a.size(2); // columns in A == rows in B (or columns if B is transposed) + + // Determine N from B tensor - it could be B.size(1) or B.size(2) depending on layout + int N; + if (mat_b.size(1) == K) { + // B is [batch, k, n] - normal layout + N = mat_b.size(2); + } else if (mat_b.size(2) == K) { + // B is [batch, n, k] - transposed layout + N = mat_b.size(1); + } else { + TORCH_CHECK(false, "CK Group GEMM 3D-3D: B tensor dimensions incompatible with A. 
A=[", + batch_size, ",", M, ",", K, "], B=[", mat_b.size(0), ",", mat_b.size(1), ",", mat_b.size(2), "]"); + } + + for (int i = 0; i < batch_size; ++i) { + // Select A batch for group i: A[i, :, :] + const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size; + + // Select B batch for group i: B[i, :, :] + const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size; + + // Select output batch for group i: Output[i, :, :] + void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size; + + int lda, ldb, ldc; + + if (std::is_same::value) { + // Row-major A: leading dimension = distance between rows = stride(1) + lda = mat_a.stride(1); + } else { + // Column-major A: leading dimension = distance between columns = stride(2) + lda = mat_a.stride(2); + } + + if (std::is_same::value) { + // Row-major B: leading dimension = distance between rows + if (mat_b.size(1) == K) { + // B is [batch, k, n] - normal layout + ldb = mat_b.stride(1); // stride between K rows + } else { + // B is [batch, n, k] - transposed layout, treat as [k, n] for GEMM + ldb = mat_b.stride(2); // stride between N rows (since we're accessing as [k,n]) + } + } else { + // Column-major B: leading dimension = distance between columns + if (mat_b.size(1) == K) { + // B is [batch, k, n] - normal layout + ldb = mat_b.stride(2); // stride between N columns + } else { + // B is [batch, n, k] - transposed layout + ldb = mat_b.stride(1); // stride between K columns (since we're accessing as [n,k]→[k,n]) + } + } + + // Output is typically row-major: leading dimension = distance between rows = stride(1) + ldc = out.stride(1); + + gemm_descs.push_back({ + static_cast(M), + static_cast(N), + static_cast(K), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else if (mat_a_dim == 3 && mat_b_dim == 2) { + // 3D*2D case requires offset tensor + auto offs_accessor = offs->accessor(); + int num_groups = offs_accessor.size(0); + // 3d*2d input, output is 3d. + // A: [n_groups, m, k], B: [k, total_n] (assuming row-major for both) + // Offset divides N dimension of B, each group gets different slice of B and different batch of A + const int batch_size = mat_a.size(0); // n_groups + const int M = mat_a.size(1); // rows in each A matrix + const int K = mat_a.size(2); // columns in A + + // For row-major A and B case: B should be [K, total_N] + const int total_N = mat_b.size(1); // B is [K, total_N] for row-major + + for (int i = 0; i < num_groups; ++i) { + int start_n = (i == 0) ? 
0 : offs_accessor[i - 1]; + int end_n = offs_accessor[i]; + int n = end_n - start_n; + + // Skip zero-sized groups but continue processing subsequent groups + if (n <= 0) { + continue; + } + + // Select A batch for group i: A[i, :, :] + const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size; + + // Select B slice for group i: B[:, start_n:end_n] (B[K, total_N]) + const void* group_b_ptr; + int ldb; + + // Check if B is row-major or column-major + if (std::is_same::value) { + // Row-major B [K, total_N]: slice columns [start_n:end_n] + group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size; + ldb = mat_b.stride(0); // distance between rows (should be total_N) + } else { + // Column-major B [K, total_N]: slice columns [start_n:end_n] + group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size; + ldb = mat_b.stride(1); // distance between columns (should be K) + } + + // Select output slice for group i: Output[:, start_n:end_n] + void* group_e_ptr = out_ptr_base + start_n * out.stride(1) * out_element_size; + + int lda, ldc; + + // Row-major A: leading dimension = distance between rows = stride(1) + lda = mat_a.stride(1); + // Output is row-major: leading dimension = distance between rows = stride(0) + ldc = out.stride(0); + + gemm_descs.push_back({ + static_cast(M), + static_cast(n), + static_cast(K), + static_cast(lda), + static_cast(ldb), + static_cast(ldc), + {} // --> stride_Ds_ + }); + p_a_ptrs.push_back(group_a_ptr); + p_b_ptrs.push_back(group_b_ptr); + p_e_ptrs.push_back(group_e_ptr); + } + } else { + TORCH_CHECK(false, "CK Group GEMM: Unsupported dimensions, mat A dim is ", mat_a_dim, ", mat B dim is ", mat_b_dim); + } + + TORCH_INTERNAL_ASSERT(p_a_ptrs.size() > 0, "CK Group GEMM: No valid groups"); + + // Initialize d_ptrs with the correct size + std::vector> d_ptrs(p_a_ptrs.size()); + + static DeviceOp gemm_instance; + auto argument = gemm_instance.MakeArgument( + p_a_ptrs, p_b_ptrs, d_ptrs, p_e_ptrs, + gemm_descs, PassThrough{}, PassThrough{}, PassThrough{} + ); + TORCH_INTERNAL_ASSERT(gemm_instance.IsSupportedArgument(argument), + "CK Group GEMM: argument unsupported (shape/strides/type config)"); + size_t arg_buf_size = gemm_instance.GetDeviceKernelArgSize(&argument); + size_t ws_size = gemm_instance.GetWorkSpaceSize(&argument); + + void* gemm_arg_buf = nullptr; + void* ws_buf = nullptr; + + hipMalloc(&gemm_arg_buf, arg_buf_size); + hipMalloc(&ws_buf, ws_size); + + gemm_instance.SetDeviceKernelArgs(&argument, gemm_arg_buf); + gemm_instance.SetWorkSpacePointer(&argument, ws_buf); + + auto invoker = gemm_instance.MakeInvoker(); + hipStream_t stream = c10::hip::getCurrentHIPStream(); + invoker.Run(argument, {stream}); + hipFree(gemm_arg_buf); + hipFree(ws_buf); +} + +void group_gemm_ck( + const at::Tensor& input_a, + const at::Tensor& input_b_colmajor, + const std::optional& offs, + const std::optional& /*bias*/, + at::Tensor& out) +{ + // Detect if input_a is row-major based on stride pattern + bool a_row_major = (input_a.dim() == 3) ? (input_a.stride(2) == 1) : (input_a.stride(1) == 1); + bool b_col_major = (input_b_colmajor.dim() == 3) ? 
(input_b_colmajor.stride(1) == 1) : (input_b_colmajor.stride(0) == 1); + // Ensure tensor A is row-major and contiguous if not already + at::Tensor mat_a = input_a; + if (!a_row_major) { + // If A is not row-major, make it contiguous (row-major) + mat_a = input_a.contiguous(); + } + // Force tensor B to be column-major using double transpose trick + // This guarantees stride(0) == 1 and stride(1) == K for [K, N] shape + at::Tensor mat_b = input_b_colmajor; + if (!b_col_major) { + mat_b = input_b_colmajor.transpose(-2, -1).contiguous().transpose(-2, -1); + } + + // For 3D tensors, check the last dimension stride for row-major detection + a_row_major = (mat_a.dim() == 3) ? (mat_a.stride(2) == 1) : (mat_a.stride(1) == 1); + bool b_row_major = (mat_b.dim() == 3) ? (mat_b.stride(2) == 1) : (mat_b.stride(1) == 1); + + if (mat_a.dtype() == at::kBFloat16) { + // bf16 path + if (a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (a_row_major && !b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (!a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } + } else if (mat_a.dtype() == at::kHalf) { + // fp16 path + if (a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (a_row_major && !b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (!a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } + } else if (mat_a.dtype() == at::kFloat) { + // fp32 path + if (a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (a_row_major && !b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else if (!a_row_major && b_row_major) { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } else { + launch_grouped_bgemm_ck_impl_dispatch(mat_a, mat_b, offs, out); + } + } else { + TORCH_CHECK(false, "CK Group GEMM: Unsupported mat_a dtype"); + } + +} + +} // namespace detail +} // namespace hip +} // namespace at diff --git a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm index f614429eefddf..20a942a9e2573 100644 --- a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm +++ b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm @@ -115,7 +115,7 @@ void copy_data_to_host(float* host) { std::copy( strides.begin(), strides.end() - 1, std::ostream_iterator(oss, ",")); oss << sizes.back(); - output << oss.str() << "}"; + output << oss.str() << '}'; return output; } diff --git a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp index 67558aeebbb83..6827e02cc3f42 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp @@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& out, const ConvParams& params) { << " transposed = " << params.transposed << " output_padding = " << IntArrayRef{params.output_padding} << " groups = " << params.groups << " benchmark = " << params.benchmark - << " deterministic = " << params.deterministic << "}"; + << " deterministic = " << params.deterministic << '}'; return out; } @@ -337,10 +337,6 @@ Tensor _convolution_out( TORCH_CHECK( 
3 == ndim || 4 == ndim || 5 == ndim, "convolution only supports 3D, 4D, 5D tensor"); - // get computation format for Conv/TransposedConv - bool is_channels_last_suggested = - use_channels_last_for_conv(input_r, weight_r); - Tensor input = input_r, weight = weight_r; // PyTorch does not support ChannelsLast1D case, // thus we need the transformation here @@ -348,13 +344,8 @@ Tensor _convolution_out( input = view4d(input_r); weight = view4d(weight_r); } - // ensure the input/weight/bias/output are congituous in desired format - at::MemoryFormat mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(input.ndimension()) - : at::MemoryFormat::Contiguous; - auto bias = bias_r.defined() ? bias_r.contiguous() : bias_r; - input = input.contiguous(mfmt); - weight = weight.contiguous(mfmt); + // get computation format for Conv/TransposedConv + bool is_channels_last_suggested = use_channels_last_for_conv(input, weight); auto k = weight.ndimension(); if (k == input.ndimension() + 1) { @@ -388,6 +379,14 @@ Tensor _convolution_out( expand_param_if_needed(output_padding_, "output_padding", dim); params.groups = groups_; } + + // ensure the input/weight/bias/output are congituous in desired format + at::MemoryFormat mfmt = is_channels_last_suggested + ? get_cl_tag_by_ndim(input.ndimension()) + : at::MemoryFormat::Contiguous; + auto bias = bias_r.defined() ? bias_r.contiguous() : bias_r; + input = input.contiguous(mfmt); + weight = weight.contiguous(mfmt); check_shape_forward(input, weight, bias, params, true); Tensor output; @@ -514,18 +513,9 @@ Tensor convolution_overrideable( at::borrow_from_optional_tensor(bias_r_opt); const Tensor& bias_r = *bias_r_maybe_owned; - auto k = weight_r.ndimension(); - at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous; - if (xpu_conv_use_channels_last(input_r, weight_r)) { - backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d - : at::MemoryFormat::ChannelsLast; - } - Tensor input_c = input_r.contiguous(backend_memory_format); - Tensor weight_c = weight_r.contiguous(backend_memory_format); - return _convolution( - input_c, - weight_c, + input_r, + weight_r, bias_r, stride_, padding_, diff --git a/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp b/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp new file mode 100644 index 0000000000000..2b715c053abc3 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp @@ -0,0 +1,738 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +using at::blas::ScalingType; +using at::blas::SwizzleType; + +namespace { +/* + * Scaling Type Determination: + * --------------------------- + * Conditions and corresponding Scaling Types: + * + * - If scale tensor is `Float8_e8m0fnu` or `Float8_e4m3fn`: + * - Returns BlockWise (with additional size checks). + * + * - Else if scale.numel() == 1: + * - Returns TensorWise. + * + * - Else if scale.dim() == 2 && scale.size(0) == outer_dim && scale.size(1) == + * 1: + * - Returns RowWise. + * + * - Otherwise: + * - Returns Error. 
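+ * + * Example (sizes chosen for illustration): for a float8 matmul A[128, 64] x B[64, 256], + * TensorWise means scale_a and scale_b are single-element Float tensors, while RowWise + * means scale_a is a contiguous [128, 1] Float tensor and scale_b is a contiguous + * [1, 256] Float tensor.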
+ */ + +bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) { + return at::isFloat8Type(t.scalar_type()) && + scale.scalar_type() == at::kFloat && scale.numel() == 1; +} + +bool is_rowwise_scaling(const at::Tensor& t, const at::Tensor& scale) { + return ( + at::isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat && + scale.dim() == 2 && scale.size(0) == t.size(0) && scale.size(1) == 1 && + scale.is_contiguous()); +} + +bool is_desired_scaling( + const at::Tensor& t, + const at::Tensor& scale, + ScalingType desired_scaling) { + auto result = desired_scaling == ScalingType::TensorWise + ? is_tensorwise_scaling(t, scale) + : is_rowwise_scaling(t, scale); + return result; +} + +std::pair get_joint_scaling( + std::initializer_list> options, + const at::Tensor& a, + const at::Tensor& b, + const at::Tensor& scale_a, + const at::Tensor& scale_b) { + for (auto [lhs, rhs] : options) { + if (is_desired_scaling(a, scale_a, lhs) && + is_desired_scaling(b.t(), scale_b.t(), rhs)) { + return {lhs, rhs}; + } + } + TORCH_CHECK( + false, + "Invalid scaling configuration.\n" + "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n" + "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (", + a.size(0), + ", 1) and scale_b should be (1, ", + b.size(1), + "), and both should be contiguous.\n" + "Got a.dtype()=", + a.scalar_type(), + ", scale_a.dtype()=", + scale_a.scalar_type(), + ", scale_a.size()=", + scale_a.sizes(), + ", scale_a.stride()=", + scale_a.strides(), + ", ", + "b.dtype()=", + b.scalar_type(), + ", scale_b.dtype()=", + scale_b.scalar_type(), + ", scale_b.size()=", + scale_b.sizes(), + " and scale_b.stride()=", + scale_b.strides()); +} + +Tensor& _scaled_gemm( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const ScalingType scaling_choice_a, + const ScalingType scaling_choice_b, + const std::optional& bias, + const bool use_fast_accum, + Tensor& out, + const std::optional& alpha = std::nullopt) { + // TODO: scale_result and alpha is not defined or used! + std::optional scaled_result = std::nullopt; + at::native::onednn::scaled_matmul( + mat1, + mat2, + out, + scale_a, + scale_b, + scaling_choice_a, + scaling_choice_b, + bias, + scaled_result, + use_fast_accum); + + return out; +} + +} // namespace + +// Computes matrix multiply + bias while applying scaling to input and output +// matrices Scales are only applicable when matrices are of Float8 type and +// assumed to be equal to 1.0 by default. If output matrix type is 16 or 32-bit +// type, scale_result is not applied. 
Known limitations: +// - Only works if mat1 is row-major and mat2 is column-major +// - Only works if matrices sizes are divisible by 32 +// - If 1-dimensional tensors are used then scale_a should be size = +// mat1.size(0) +// and scale_b should have size = to mat2.size(1) +// Arguments: +// - `mat1`: the first operand of the matrix multiply, can be type +// `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat2`: the second operand of the matrix multiply, can be type +// `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher +// precision floating point type +// - `scale_a`: a tensor with the inverse scale of `mat1`, whose +// shape/strides/dtype depend on the scaling scheme +// - `scale_b`: a tensor with the inverse scale of `mat2`, whose +// shape/strides/dtype depend on the scaling scheme +// - `scale_result`: a scalar tensor with the scale of the output, only +// utilized if the output is a float8 type +// - `use_fast_accum`: Not applicable for XPU. For now, it should always be +// false. +// - `out`: a reference to the output tensor + +Tensor& _scaled_mm_out_xpu( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum, + Tensor& out) { + // Note: fast_accum is not supported in XPU for now. + TORCH_CHECK(!use_fast_accum, "fast_accum is not supported in XPU for now."); + + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], + "mat1 and mat2 shapes cannot be multiplied (", + mat1.sizes()[0], + "x", + mat1.sizes()[1], + " and ", + mat2.sizes()[0], + "x", + mat2.sizes()[1], + ")"); + + // Check what type of scaling we are doing based on inputs. This list is + // sorted by decreasing priority. + + // List of supported datatypes for XPU with oneDNN: + // https://uxlfoundation.github.io/oneDNN/dev_guide_matmul.html#data-types + auto [scaling_choice_a, scaling_choice_b] = get_joint_scaling( + { + std::make_pair(ScalingType::TensorWise, ScalingType::TensorWise), + std::make_pair(ScalingType::RowWise, ScalingType::RowWise), + }, + mat1, + mat2, + scale_a, + scale_b); + TORCH_CHECK( + !scale_result || + (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat), + "scale_result must be a float scalar"); + TORCH_CHECK( + !bias || bias->numel() == mat2.sizes()[1], + "Bias must be size ", + mat2.sizes()[1], + " but got ", + bias->numel()); + TORCH_CHECK( + mat1.sizes()[1] % 16 == 0, + "Expected trailing dimension of mat1 to be divisible by 16 ", + "but got mat1 shape: (", + mat1.sizes()[0], + "x", + mat1.sizes()[1], + ")."); + TORCH_CHECK( + mat2.sizes()[0] % 16 == 0 && mat2.sizes()[1] % 16 == 0, + "mat2 shape (", + mat2.sizes()[0], + "x", + mat2.sizes()[1], + ") must be divisible by 16"); + // Check types + TORCH_CHECK( + !out_dtype || *out_dtype == out.scalar_type(), + "out_dtype must match output matrix type"); + TORCH_CHECK( + at::isFloat8Type(mat1.scalar_type()), + "Expected mat1 to be Float8 matrix got ", + mat1.scalar_type()); + TORCH_CHECK( + at::isFloat8Type(mat2.scalar_type()), + "Expected mat2 to be Float8 matrix got ", + mat2.scalar_type()); + // TODO: oneDNN Currently only supports e4m3 with group scales on BMG. Not + // support 2D scales, only 1D. 
Needs to add more checks there. + + if (bias) { + TORCH_CHECK( + bias->scalar_type() == kFloat || + bias->scalar_type() == c10::ScalarType::BFloat16 || + bias->scalar_type() == c10::ScalarType::Half, + "Bias must be Float32 or BFloat16 or Half, but got ", + bias->scalar_type()); + } + + { + auto bias_ = bias.value_or(Tensor()); + auto scale_result_ = scale_result.value_or(Tensor()); + + // NOLINTNEXTLINE(*c-array*) + TensorArg targs[]{ + {out, "out", 0}, + {mat1, "mat1", 1}, + {mat2, "mat2", 2}, + {bias_, "bias", 3}, + {scale_a, "scale_a", 4}, + {scale_b, "scale_b", 5}, + {scale_result_, "scale_result", 6}}; + checkAllSameGPU(__func__, targs); + } + + // Validation checks have passed lets resize the output to actual size + IntArrayRef mat1_sizes = mat1.sizes(); + IntArrayRef mat2_sizes = mat2.sizes(); + at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); + + // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm + // kernels do not support this case). + if (mat1_sizes[0] == 0 || mat1_sizes[1] == 0 || mat2_sizes[1] == 0) { + // `out` was created with `at::empty`. In the case where we are multiplying + // MxK by KxN and K is the zero dim, we need to initialize here to properly + // return a tensor of zeros. + if (mat1_sizes[1] == 0) { + out.zero_(); + } + + return out; + } + + // TODO: Scale_result is not supported by now!! + return _scaled_gemm( + mat1, + mat2, + scale_a, + scale_b, + scaling_choice_a, + scaling_choice_b, + bias, + use_fast_accum, + out); +} + +Tensor _scaled_mm_xpu( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + return _scaled_mm_out_xpu( + mat_a, + mat_b, + scale_a, + scale_b, + bias, + scale_result, + out_dtype, + use_fast_accum, + out); +} + +using acceptance_fn = std::function&, + ArrayRef&, + c10::ScalarType, + std::vector&, + ArrayRef&)>; +using namespace std::placeholders; + +namespace scaled_blas = at::native::onednn::scaled; +using scaled_blas::convert_int_to_enum; +using scaled_blas::ScaledGemmImplementation; + +std::array, 2> + scale_kernel_dispatch = {{ + {"tensorwise_tensorwise", + scaled_blas::check_tensorwise_recipe, + ScaledGemmImplementation::TENSORWISE_TENSORWISE}, + {"rowwise_rowwise", + scaled_blas::check_rowwise_recipe, + ScaledGemmImplementation::ROWWISE_ROWWISE}, + + }}; + +Tensor& _scaled_tensorwise_tensorwise( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32 + + TORCH_CHECK_VALUE( + isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), + "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), + mat_b.scalar_type()); + TORCH_CHECK_VALUE( + scale_a.numel() == 1 && scale_a.scalar_type() == kFloat, + "scale_a must have 1 Float element") + TORCH_CHECK_VALUE( + scale_b.numel() == 1 && scale_b.scalar_type() == kFloat, + "scale_b must have 1 Float element") + + auto scaling_choice_a = ScalingType::TensorWise; + auto scaling_choice_b = ScalingType::TensorWise; + + _scaled_gemm( + mat_a, + mat_b, + scale_a, + scale_b, + scaling_choice_a, + scaling_choice_b, + bias, + use_fast_accum, + out); + + 
return out; +} + +Tensor& _scaled_rowwise_rowwise( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32, shape M/N for A/B + TORCH_CHECK_VALUE( + isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), + "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), + mat_b.scalar_type()); + TORCH_CHECK_VALUE( + scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1, + "scale_a must have shape [", + mat_a.size(0), + ", 1], got [", + scale_a.sizes(), + "]"); + TORCH_CHECK_VALUE( + scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, + "scale_a must have ", + mat_a.size(0), + " Float elements, got ", + scale_a.numel()) + TORCH_CHECK_VALUE( + scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, + "scale_b must have ", + mat_b.size(1), + " Float elements, got ", + scale_b.numel()) + + TORCH_CHECK_VALUE( + scale_a.stride(1) == 1, + "expected scale_a.stride(1) to be 1, but got ", + scale_a.stride(1)); + TORCH_CHECK_VALUE( + scale_b.stride(1) == 1, + "expected scale_b.stride(1) to be 1, but got ", + scale_b.stride(1)); + + auto scaling_choice_a = ScalingType::RowWise; + auto scaling_choice_b = ScalingType::RowWise; + + _scaled_gemm( + mat_a, + mat_b, + scale_a, + scale_b, + scaling_choice_a, + scaling_choice_b, + bias, + use_fast_accum, + out); + + return out; +} + +// V2: Computes matrix multiply + bias while applying scaling to input and +// output matrices Scales are only applicable when matrices are of Float8 type +// and assumed to be equal to 1.0 by default. If output matrix type is 16 or +// 32-bit type, scale_result is not applied. Known limitations: +// - Only works if mat1 is row-major and mat2 is column-major +// - Only works if matrices sizes are divisible by 32 +// - If 1-dimensional tensors are used then scale_a should be size = +// mat1.size(0) +// and scale_b should have size = to mat2.size(1) +// Arguments: +// - `mat_a`: the first operand of the matrix multiply, can be type +// `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat_b`: the second operand of the matrix multiply, can be type +// `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `scale_a`: a tensor with the inverse scale of `mat1`, whose +// shape/strides/dtype depend on the scaling scheme +// - `scale_recipe_a`: An integer corresponding to an enum describing the +// scaling scheme used for `scale_a` +// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing +// the swizzling scheme for `scale_a`. +// Not supported for XPU for now. +// - `scale_b`: a tensor with the inverse scale of `mat2`, whose +// shape/strides/dtype depend on the scaling scheme +// - `scale_recipe_b`: An integer corresponding to an enum describing the +// scaling scheme used for `scale_b` +// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing +// the swizzling scheme for `scale_b`. +// Not supported for XPU for now. +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher +// precision floating point type +// - `contraction_dim`: describe which dimensions are `K` in the matmul. +// Not supported for XPU. Should always be empty. +// - `use_fast_accum`: Not supported for XPU, should always be false. 
+// - `out`: a reference to the output tensor +Tensor& _scaled_mm_xpu_v2_out( + const Tensor& mat_a, + const Tensor& mat_b, + ArrayRef scale_a, + IntArrayRef scale_recipe_a, + IntArrayRef swizzle_a, + ArrayRef scale_b, + IntArrayRef scale_recipe_b, + IntArrayRef swizzle_b, + const std::optional& bias, + const std::optional out_dtype, + IntArrayRef contraction_dim, + bool use_fast_accum, + Tensor& out) { + TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix"); + TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix"); + + // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm + // kernels do not support this case). + if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) { + // `out` was created with `at::empty`. In the case where we are multiplying + // MxK by KxN and K is the zero dim, we need to initialize here to properly + // return a tensor of zeros. + at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)}); + if (mat_a.size(1) == 0) { + out.zero_(); + } + + return out; + } + + // Note: The `contraction_dim` is not actually used for now. We will need to + // align this code when upstreamed CUDA code is done. Currently, only keeps + // the code here for check. + + // Check if the input matrix sizes can be multiplied + // - if optional contraction dims are provided, use those + // -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not + // available. + if (contraction_dim.size() > 0) { + TORCH_CHECK_VALUE( + contraction_dim.size() == 2, + "contraction_dim must have exactly 2 elements"); + auto mat_a_dim = contraction_dim[0]; + auto mat_b_dim = contraction_dim[1]; + TORCH_CHECK_VALUE( + mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim), + "mat_a and mat_b shapes cannot be multiplied (", + mat_a.size(0), + "x", + mat_a.size(1), + " and ", + mat_b.size(0), + "x", + mat_b.size(1), + ") ", + "with contraction dims mat_a: ", + mat_a_dim, + ", mat_b: ", + mat_b_dim); + } else { + TORCH_CHECK_VALUE( + mat_a.size(1) == mat_b.size(0), + "mat_a and mat_b shapes cannot be multiplied (", + mat_a.size(0), + "x", + mat_a.size(1), + " and ", + mat_b.size(0), + "x", + mat_b.size(1), + ")"); + } + + TORCH_CHECK_VALUE( + !bias || bias->numel() == mat_b.sizes()[1], + "Bias must be size ", + mat_b.sizes()[1], + " but got ", + bias->numel()); + + TORCH_CHECK_VALUE( + !out_dtype || *out_dtype == out.scalar_type(), + "out_dtype must match output matrix type"); + + if (bias) { + TORCH_CHECK_VALUE( + bias->scalar_type() == kFloat || + bias->scalar_type() == c10::ScalarType::BFloat16 || + bias->scalar_type() == c10::ScalarType::Half, + "Bias must be Float32 or BFloat16 or Half, but got ", + bias->scalar_type()); + } + { + auto bias_ = bias.value_or(Tensor()); + // NOLINTNEXTLINE(*c-array*) + TensorArg targs[]{ + {out, "out", 0}, + {mat_a, "mat_a", 1}, + {mat_b, "mat_b", 2}, + {bias_, "bias", 3}, + {scale_a[0], "scale_a", 4}, + {scale_b[0], "scale_b", 5}}; + checkAllSameGPU(__func__, targs); + } + // Align with CUDA's default out to be bf16 + auto out_dtype_ = out_dtype.value_or(c10::ScalarType::BFloat16); + + // Conversion of implicitly-defined enums to explicit + auto scale_recipe_a_enum = convert_int_to_enum(scale_recipe_a); + auto swizzle_a_enum = convert_int_to_enum(swizzle_a); + auto scale_recipe_b_enum = convert_int_to_enum(scale_recipe_b); + auto swizzle_b_enum = convert_int_to_enum(swizzle_b); + + // XPU does not support swizzle for now. So directly return false. 
+ TORCH_CHECK_VALUE( + swizzle_a_enum[0] == at::blas::SwizzleType::NO_SWIZZLE && + swizzle_b_enum[0] == at::blas::SwizzleType::NO_SWIZZLE, + "XPU does not support swizzle yet."); + + // at this point we can start working out what we want to be doing + // Try to do as few steps as possible. + // NOTE: support is deliberately sparse, can explicitly enumerate all + // combinations allowed. Do this via a list of defined (name, acceptance, + // concrete_impl) tuples. + bool found_impl = false; + ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE; + + for (const auto& fn_entry : scale_kernel_dispatch) { + const auto [name, accept_fn, scaled_gemm_impl] = fn_entry; + bool ok = accept_fn( + mat_a.scalar_type(), + scale_recipe_a_enum, + scale_a, + mat_b.scalar_type(), + scale_recipe_b_enum, + scale_b); + if (ok) { + gemm_impl = scaled_gemm_impl; + found_impl = true; + break; + } + } + TORCH_CHECK_VALUE( + found_impl, + "Invalid scaling configuration.\n" + "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n" + "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (", + mat_a.size(0), + ", 1) and scale_b should be (1, ", + mat_b.size(1), + "), and both should be contiguous.\n" + "Got mat_a.dtype()=", + mat_a.scalar_type(), + ", scale_a[0].dtype()=", + scale_a[0].scalar_type(), + ", scale_a[0].size()=", + scale_a[0].sizes(), + ", scale_a[0].stride()=", + scale_a[0].strides(), + ", ", + "mat_b.dtype()=", + mat_b.scalar_type(), + ", scale_b[0].dtype()=", + scale_b[0].scalar_type(), + ", scale_b[0].size()=", + scale_b[0].sizes(), + " and scale_b[0].stride()=", + scale_b[0].strides()); + + at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)}); + + auto bias_ = bias.value_or(Tensor()); + + // dispatch to appropriate lower-level calls for error checking & execution + if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) { + return _scaled_tensorwise_tensorwise( + mat_a, + mat_b, + scale_a[0], + scale_b[0], + bias, + out_dtype_, + use_fast_accum, + out); + } else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) { + return _scaled_rowwise_rowwise( + mat_a, + mat_b, + scale_a[0], + scale_b[0], + bias, + out_dtype_, + use_fast_accum, + out); + } else { + TORCH_CHECK_VALUE( + false, "Invalid state - found an implementation, but not really"); + } +} + +Tensor _scaled_mm_xpu_v2( + const Tensor& mat_a, + const Tensor& mat_b, + ArrayRef scale_a, + IntArrayRef scale_recipe_a, + IntArrayRef swizzle_a, + ArrayRef scale_b, + IntArrayRef scale_recipe_b, + IntArrayRef swizzle_b, + const std::optional& bias, + const std::optional out_dtype, + IntArrayRef contraction_dim, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + + return _scaled_mm_xpu_v2_out( + mat_a, + mat_b, + scale_a, + scale_recipe_a, + swizzle_a, + scale_b, + scale_recipe_b, + swizzle_b, + bias, + out_dtype, + contraction_dim, + use_fast_accum, + out); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp index 282f42f37a364..4d6cb1b81fac3 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp @@ -133,7 +133,7 @@ at::Tensor quantized_convolution( // supported in conv. mask_weight = weight_zero_points.numel() > 1 ? 
1 : 0; if (groups > 1 && weight_zero_points.numel() > 1) - mask_weight = (2 ^ 0) | (2 ^ 1); // 2^0 (group) | 2^1 (output channel) + mask_weight = (1 << 0) | (1 << 1); // 2^0 (group) | 2^1 (output channel) dnnl::primitive_attr pattr; bool src_need_zp = (act_zero_point != 0); diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp index ede01093ff3e7..f79dfadd65454 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -8,7 +9,6 @@ #include namespace at::native::onednn { - at::Tensor broadcast_bias2D( at::Tensor& dst, at::Tensor& bias, @@ -328,4 +328,236 @@ void quantized_matmul( result.copy_(dst); } +// Describes how to configure oneDNN scales for a given role/ScalingType +struct ScaleSpec { + // specifies the way scale values will be applied to an ARG tensor. + int mask; + // specifies how scales are grouped along dimensions where + // multiple scale factors are used. + dnnl::memory::dims groups; + // specifies data type for scale factors. + dnnl::memory::data_type dtype; + + // Helper to compute expected number of elements for scale tensors + // arg_type: "src" for SRC (groups pattern {1, X}), + // "wei" for WEIGHTS (groups pattern {X, 1}) + int64_t expected_numel( + int64_t outer_dim, + int64_t inner_dim, + const std::string& arg_type) const { + if (groups == dnnl::memory::dims{1, 1}) + return 1; // tensorwise scaling + + TORCH_CHECK( + arg_type == "src" || arg_type == "wei", + "Expected arg_type to be 'src' or 'wei', but got '", + arg_type, + "'"); + + // For rowwise: SRC groups={1, K}, WEI groups={K, 1} + TORCH_INTERNAL_ASSERT( + (groups == dnnl::memory::dims{1, inner_dim} || + groups == dnnl::memory::dims{inner_dim, 1}), + "The groups must be either {1, inner_dim} or {inner_dim, 1}. But got ", + groups, + "."); + return outer_dim; + } + + // Normalize an incoming scale tensor to contiguous storage and appropriate + // dtype/view + at::Tensor normalize(const at::Tensor& scale) const { + TORCH_INTERNAL_ASSERT( + dtype == dnnl::memory::data_type::f32, + "tensor scale currently must be f32, but got scale dtype: ", + scale.scalar_type()); + return scale.to(at::kFloat).contiguous(); + } +}; + +// This function defines how to set scales mask and groups according to: +// https://github.com/uxlfoundation/oneDNN/blob/main/tests/benchdnn/doc/knobs_attr.md#--attr-scales +// The returned value will be used in +// `set_scales(arg, mask, groups, data_type)`. +inline ScaleSpec make_scale_spec( + at::blas::ScalingType scaling_type, + int64_t M, + int64_t K, + int64_t N, + const std::string& arg_type) { + TORCH_CHECK( + arg_type == "src" || arg_type == "wei", + "Expected arg_type to be 'src' or 'wei', but got '", + arg_type, + "'"); + TORCH_INTERNAL_ASSERT( + (scaling_type == at::blas::ScalingType::TensorWise || + scaling_type == at::blas::ScalingType::RowWise), + "Currently only support scaling_type for TensorWise or RowWise"); + int64_t dim = K; // Currently only K is used for grouping + bool is_src = (arg_type == "src"); + if (scaling_type == at::blas::ScalingType::TensorWise) { + // Scale tensorwise. The same as `--attr-scales=common`. + // mask=0 : scale whole tensor + // groups={1, 1}: indicates that there is only one group for scaling + return {0, {1, 1}, dnnl::memory::data_type::f32}; + } else { + // (scaling_type == at::blas::ScalingType::RowWise) + // Scale RowWise. The same as `--attr-scales=per_dim_01`. 
+ // mask={(1 << 0) | (1 << 1)}: Scale on both dim0 and dim1 + // SRC: groups={1, K}, WEIGHTS: groups={K, 1} + return { + (1 << 0) | (1 << 1), + is_src ? dnnl::memory::dims{1, dim} : dnnl::memory::dims{dim, 1}, + dnnl::memory::data_type::f32}; + } +} + +sycl::event scaled_matmul( + const Tensor& mat1, + const Tensor& mat2, + Tensor& result, + const Tensor& scale_a, + const Tensor& scale_b, + at::blas::ScalingType scaling_choice_a, + at::blas::ScalingType scaling_choice_b, + const std::optional& bias, + const std::optional& scale_result, + bool use_fast_accum) { + auto& engine = GpuEngineManager::Instance().get_engine(); + auto& stream = GpuStreamManager::Instance().get_stream(); + + // This function will do steps with following steps + // 1. create memory descriptor + // 2. call write_to_dnnl_memory() to actually write memory + // 3. execute + + const int64_t M = mat1.size(0); + const int64_t K = mat1.size(1); + const int64_t N = mat2.size(1); + + // 1.1 Create memory descriptor + dnnl::memory::desc src_md = get_onednn_md(mat1); + dnnl::memory::desc weights_md = get_onednn_md(mat2); + dnnl::memory::desc dst_md = get_onednn_md(result); + + // scale_a and scale_b has already be checked in `is_desired_scaling()` call. + // So we could directly get their memory desc and set later. + dnnl::memory::desc scale_a_md = get_onednn_md(scale_a); + dnnl::memory::desc scale_b_md = get_onednn_md(scale_b); + + dnnl::memory::desc bias_md; + bool with_bias = bias.has_value(); + at::Tensor possible_reshaped_bias = bias.value_or(at::Tensor()); + if (with_bias) { + if (possible_reshaped_bias.dim() == 1) { + possible_reshaped_bias = + possible_reshaped_bias.reshape({1, possible_reshaped_bias.size(0)}); + bias_md = get_onednn_md(possible_reshaped_bias); + } else { + bias_md = get_onednn_md(possible_reshaped_bias); + } + } + + // 1.2 Create primitive descriptor and set scales mask + const ScaleSpec src_spec = make_scale_spec(scaling_choice_a, M, K, N, "src"); + const ScaleSpec wei_spec = make_scale_spec(scaling_choice_b, M, K, N, "wei"); + + dnnl::primitive_attr op_attr = dnnl::primitive_attr(); + +#if ONEDNN_SUPPORT_DETERMINISTIC + if (at::globalContext().deterministicAlgorithms() || + at::globalContext().deterministicMkldnn()) + op_attr.set_deterministic(true); +#endif + + std::vector default_groups; + op_attr.set_scales( + DNNL_ARG_SRC, src_spec.mask, src_spec.groups, src_spec.dtype); + op_attr.set_scales( + DNNL_ARG_WEIGHTS, wei_spec.mask, wei_spec.groups, wei_spec.dtype); + // scale_result tensor currently only supports scalar(TensorWise Scaling). + bool with_dst_scale = scale_result && scale_result->defined(); + if (with_dst_scale) { + op_attr.set_scales(DNNL_ARG_DST, 0, {1}, dnnl::memory::data_type::f32); + } + + op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + // 1.3 Create the matmul primitive descriptor + dnnl::matmul::primitive_desc matmul_pd = with_bias + ? dnnl::matmul::primitive_desc( + engine, src_md, weights_md, bias_md, dst_md, op_attr) + : dnnl::matmul::primitive_desc( + engine, src_md, weights_md, dst_md, op_attr); + + // 1.4 (Possible) Additional Checks + // TODO: In case there are memory desc does not align with the actual tensor, + // we might need to reorder weights similar to CPU's reorder_if_differ_in() + // call. For example, weights not the same as matmul_pd.weights_desc(), + + // 2. 
Prepare memory + + // Create memory + auto src_usr_m = make_onednn_memory(src_md, engine, mat1.data_ptr()); + auto weights_usr_m = make_onednn_memory(weights_md, engine, mat2.data_ptr()); + auto dst_usr_m = make_onednn_memory(dst_md, engine, result.data_ptr()); + dnnl::memory b_usr_m; + if (with_bias) { + b_usr_m = + make_onednn_memory(bias_md, engine, possible_reshaped_bias.data_ptr()); + } + + // Prepare runtime scale memories (flat 1-D views) using the specs + auto make_scale_mem_from_spec = [&](const ScaleSpec& spec, + int64_t expected_numel, + const at::Tensor& scale_tensor) { + at::Tensor prepared = spec.normalize(scale_tensor); + TORCH_CHECK( + prepared.numel() == expected_numel, + "Scale buffer length mismatch. Expected ", + expected_numel, + ", got ", + prepared.numel()); + dnnl::memory::desc scale_md( + {prepared.numel()}, spec.dtype, dnnl::memory::format_tag::x); + return make_onednn_memory(scale_md, engine, prepared.data_ptr()); + }; + + auto scratchpad = + make_onednn_memory(matmul_pd.scratchpad_desc(), engine, nullptr); + + // 3. Setup Args for exec + std::unordered_map args; + args.insert({DNNL_ARG_SRC, src_usr_m}); + args.insert({DNNL_ARG_WEIGHTS, weights_usr_m}); + args.insert({DNNL_ARG_DST, dst_usr_m}); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad}); + if (with_bias) { + args.insert({DNNL_ARG_BIAS, b_usr_m}); + } + + // Attach runtime scales using specs + auto src_sc_mem = make_scale_mem_from_spec( + src_spec, src_spec.expected_numel(M, K, "src"), scale_a); + auto wei_sc_mem = make_scale_mem_from_spec( + wei_spec, wei_spec.expected_numel(N, K, "wei"), scale_b); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_sc_mem}); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_sc_mem}); + if (with_dst_scale) { + // Bind single f32 scalar as DST scale + at::Tensor dst_scale_f32 = scale_result->to(at::kFloat).contiguous(); + dnnl::memory::desc dst_sc_md( + {1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x); + auto dst_sc_mem = + make_onednn_memory(dst_sc_md, engine, dst_scale_f32.data_ptr()); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_sc_mem}); + } + + dnnl::matmul matmul_p = dnnl::matmul(matmul_pd); + sycl::event matmul_fwd_event = + dnnl::sycl_interop::execute(matmul_p, stream, args); + return matmul_fwd_event; +} + } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp index 15f24e9cbb3a4..a8a6b870ff6b6 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp @@ -78,6 +78,10 @@ dnnl::memory::data_type get_onednn_dtype( return dnnl::memory::data_type::f32; case at::ScalarType::BFloat16: return dnnl::memory::data_type::bf16; + case at::ScalarType::Float8_e4m3fn: + return dnnl::memory::data_type::f8_e4m3; + case at::ScalarType::Float8_e5m2: + return dnnl::memory::data_type::f8_e5m2; default: if (!allow_undef) { TORCH_CHECK( diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h index 6b2bf01e6d73d..bbe880b672b9d 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -202,4 +203,16 @@ void sdpa_backward( Tensor& grad_query, Tensor& grad_key, Tensor& grad_value); + +sycl::event scaled_matmul( + const Tensor& mat1, + const Tensor& mat2, + Tensor& result, + const Tensor& scale_a, + const Tensor& 
scale_b, + at::blas::ScalingType scaling_choice_a, + at::blas::ScalingType scaling_choice_b, + const std::optional& bias, + const std::optional& scale_result, + bool use_fast_accum); } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h index 03b3076402d0a..cb488a3f5f117 100644 --- a/aten/src/ATen/native/mps/OperationUtils.h +++ b/aten/src/ATen/native/mps/OperationUtils.h @@ -40,8 +40,6 @@ using namespace at::mps; namespace at::native::mps { -void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()); - struct MPSScalar { id getMTLBuffer() const { return __builtin_bit_cast(id, buffer.get()); @@ -84,6 +82,7 @@ NSArray* getTensorAxes(const TensorBase& t); NSArray* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim); std::string getMPSShapeString(MPSShape* shape); std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false); +std::string to_hex_key(float); std::string getArrayRefString(const IntArrayRef s); // use has_storage() on the returned tensor to determine if src actually is a view Tensor gatherViewTensor(const Tensor& src, Tensor& dst); diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm index 96cd5d41959c3..196d514a2c580 100644 --- a/aten/src/ATen/native/mps/OperationUtils.mm +++ b/aten/src/ATen/native/mps/OperationUtils.mm @@ -53,21 +53,6 @@ - (MPSGraphTensor*)maximumWithNaNPropagationAndIntFallbackWithPrimaryTensor:(MPS @end namespace at::native::mps { - -void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) { - __block std::optional block_exception; - dispatch_sync(queue, ^() { - try { - block(); - } catch (...) { - block_exception = std::current_exception(); - } - }); - if (block_exception) { - std::rethrow_exception(*block_exception); - } -} - /** * Computes distance from lowest to highest element offset in given tensor. 
*/ @@ -316,6 +301,10 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { return fmt::to_string(fmt::join(s, ",")); } +std::string to_hex_key(float f) { + return fmt::format("{:a}", f); +} + std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) { fmt::basic_memory_buffer buffer; auto buf_iterator = std::back_inserter(buffer); diff --git a/aten/src/ATen/native/mps/kernels/Indexing.metal b/aten/src/ATen/native/mps/kernels/Indexing.metal index b41e64d70ced5..ebe078d01781e 100644 --- a/aten/src/ATen/native/mps/kernels/Indexing.metal +++ b/aten/src/ATen/native/mps/kernels/Indexing.metal @@ -1,4 +1,5 @@ #include +#include #include #include @@ -31,10 +32,24 @@ OffsetT index_apply_indices( constant IndexAB* indices, constant int64_t* sizes, constant int64_t* strides, - uint num_indices) { + uint num_indices, + thread bool& error, + device ErrorMessages* error_buf) { OffsetT rc = offs.x; for (uint i = 0; i < num_indices; i++) { auto idx = indices[i].indexArray[offs.y]; + if (idx < -sizes[i] || idx >= sizes[i]) { + TORCH_REPORT_ERROR( + error_buf, + "index ", + idx, + " is out of bounds for dimension ", + i, + " with size ", + sizes[i]); + error = true; + break; + } if (idx < 0) { idx += sizes[i]; } @@ -55,6 +70,7 @@ kernel void index_select( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { const auto ndim = ndim_nindices_numel.x; const auto num_indices = ndim_nindices_numel.y; @@ -65,8 +81,19 @@ kernel void index_select( indices_strides, ndim, thread_index); + bool error = false; auto input_offs = index_apply_indices( - offs.yz, indices, index_sizes, index_strides, num_indices); + offs.yz, + indices, + index_sizes, + index_strides, + num_indices, + error, + error_buffer); + if (error) { + output[offs.x / sizeof(T)] = 0; + return; + } output[offs.x / sizeof(T)] = input[input_offs / sizeof(T)]; } @@ -82,7 +109,9 @@ inline void index_put_impl( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index) { + bool error = false; const auto ndim = ndim_nindices_numel.x; const auto num_indices = ndim_nindices_numel.y; const auto offs = index_get_offsets( @@ -93,7 +122,16 @@ inline void index_put_impl( ndim, thread_index); auto output_offs = index_apply_indices( - offs.xz, indices, index_sizes, index_strides, num_indices); + offs.xz, + indices, + index_sizes, + index_strides, + num_indices, + error, + error_buffer); + if (error) { + return; + } output[output_offs / sizeof(T)] = input[offs.y / sizeof(T)]; } @@ -109,6 +147,7 @@ kernel void index_put( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { index_put_impl( output, @@ -121,6 +160,7 @@ kernel void index_put( index_sizes, index_strides, ndim_nindices_numel, + error_buffer, thread_index); } @@ -136,6 +176,7 @@ kernel void index_put_serial( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { (void)thread_index; // Suppress unused vairable varning for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) { @@ -150,6 +191,7 @@ kernel void index_put_serial( index_sizes, index_strides, ndim_nindices_numel, + 
error_buffer, idx); } } @@ -166,6 +208,7 @@ kernel void index_put_accumulate( constant int64_t* index_sizes, constant int64_t* index_strides, constant uint4& ndim_nindices_numel, + device ErrorMessages* error_buffer, uint thread_index [[thread_position_in_grid]]) { const auto ndim = ndim_nindices_numel.x; const auto num_indices = ndim_nindices_numel.y; @@ -176,8 +219,18 @@ kernel void index_put_accumulate( indices_strides, ndim, thread_index); + bool error = false; auto output_offs = index_apply_indices( - offs.xz, indices, index_sizes, index_strides, num_indices); + offs.xz, + indices, + index_sizes, + index_strides, + num_indices, + error, + error_buffer); + if (error) { + return; + } AtomicType::atomic_add( reinterpret_cast*>(output), output_offs / sizeof(T), @@ -197,6 +250,7 @@ kernel void index_put_accumulate( constant int64_t* index_sizes, \ constant int64_t* index_strides, \ constant uint4& ndim_nindices_numel, \ + device ErrorMessages* error_buffer, \ uint thread_index [[thread_position_in_grid]]) #define REGISTER_INDEX_OP_ALL_DTYPES(OP_NAME) \ diff --git a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal index c356dbf9ecb38..ecb2ddefd1fc1 100644 --- a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal +++ b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal @@ -40,7 +40,7 @@ inline c10::metal::opmath_t matmul_inner( threadgroup_barrier(mem_flags::mem_threadgroup); for (uint k = 0; k < TILE_DIM; k++) { - sum += A_tile[tid.y][k] * B_tile[k][tid.x]; + sum += c10::metal::mul(A_tile[tid.y][k], B_tile[k][tid.x]); } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -96,7 +96,9 @@ kernel void addmm( auto bias = biasData[thread_id.y * strides[3].x + thread_id.x * strides[3].y]; outputData[thread_id.y * strides[2].x + thread_id.x * strides[2].y] = - static_cast(alpha_beta[0] * sum + alpha_beta[1] * bias); + static_cast( + c10::metal::mul(alpha_beta[0], sum) + + c10::metal::mul(alpha_beta[1], bias)); } } @@ -832,6 +834,10 @@ INSTANTIATE_MM_OPS(float); INSTANTIATE_MM_OPS(half); INSTANTIATE_MM_OPS(bfloat); +// Complex MM +INSTANTIATE_MM_OPS(float2); +INSTANTIATE_MM_OPS(half2); + // Integral MM INSTANTIATE_MM_OPS(long); INSTANTIATE_MM_OPS(int); diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm index 16d744cedb8ef..5ebf5f604bfc1 100644 --- a/aten/src/ATen/native/mps/operations/Blas.mm +++ b/aten/src/ATen/native/mps/operations/Blas.mm @@ -141,6 +141,9 @@ Tensor dot_mps(const Tensor& self, const Tensor& other) { }; MPSStream* stream = at::mps::getCurrentMPSStream(); + if (result.numel() == 0) { + return result; + } Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1); @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm index e6690b2531f0d..d7916ccdf875d 100644 --- a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm +++ b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm @@ -220,7 +220,7 @@ Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad, auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? 
output_grad.numel() : num_indices * params.feature_size; MPSStream* stream = getCurrentMPSStream(); - mps::dispatch_sync_with_rethrow(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { @autoreleasepool { id computeEncoder = stream->commandEncoder(); auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}", @@ -273,7 +273,7 @@ Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad, auto num_threads = num_indices * feature_size; MPSStream* stream = getCurrentMPSStream(); - mps::dispatch_sync_with_rethrow(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { @autoreleasepool { id computeEncoder = stream->commandEncoder(); auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}", diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index 0b0a84c45a52c..2a21f3f8aadca 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -179,7 +179,8 @@ static void dispatch_index_kernel(TensorIteratorBase& iter, iter.strides(2), index_size, index_stride, - ndim_nindiees); + ndim_nindiees, + mpsStream->getErrorBuffer()); mtl_dispatch1DJob(computeEncoder, indexSelectPSO, serial ? 1 : iter.numel()); }); } @@ -299,7 +300,7 @@ static Tensor nonzero_fallback(const Tensor& self) { MPSStream* stream = getCurrentMPSStream(); using CachedGraph = MPSUnaryCachedGraph; - dispatch_sync(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { stream->synchronize(SyncType::COMMIT_AND_WAIT); }); int64_t total_nonzero = at::count_nonzero(self).item(); @@ -384,7 +385,7 @@ static Tensor nonzero_fallback(const Tensor& self) { MPSStream* stream = getCurrentMPSStream(); using CachedGraph = MPSUnaryCachedGraph; - dispatch_sync(stream->queue(), ^() { + dispatch_sync_with_rethrow(stream->queue(), ^() { stream->synchronize(SyncType::COMMIT_AND_WAIT); }); int64_t total_nonzero = at::count_nonzero(self).item(); diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm index aed417ca9ca92..ca19d121bb718 100644 --- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -121,7 +121,7 @@ const Scalar& alpha, const Scalar& beta, const Tensor& bias) { - if (beta.toDouble() == 0 && alpha.toDouble() == 1) { + if (beta.isFloatingPoint() && alpha.isFloatingPoint() && beta.toDouble() == 0 && alpha.toDouble() == 1) { return do_metal_mm(self, other, output); } auto stream = getCurrentMPSStream(); @@ -147,13 +147,15 @@ std::array i64; std::array i32; std::array f32; - } alpha_beta; + std::array, 2> c64; + } alpha_beta{}; if (output.scalar_type() == kLong) { alpha_beta.i64 = {alpha.toLong(), beta.toLong()}; } else if (c10::isIntegralType(output.scalar_type(), true)) { alpha_beta.i32 = {alpha.toInt(), beta.toInt()}; + } else if (c10::isComplexType(output.scalar_type())) { + alpha_beta.c64 = {alpha.toComplexFloat(), beta.toComplexFloat()}; } else { - TORCH_INTERNAL_ASSERT(c10::isFloatingType(output.scalar_type())); alpha_beta.f32 = {alpha.toFloat(), beta.toFloat()}; } constexpr uint32_t TILE_DIM = 16; // fastest performance from tests on multiple macs @@ -190,10 +192,16 @@ bool use_metal_mm(const Tensor& self, const Tensor& other, const Tensor& output) { static bool always_use_metal = c10::utils::has_env("PYTORCH_MPS_PREFER_METAL"); 
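+  // Rough decision sketch for this helper: prefer the hand-written Metal matmul
+  // kernels when PYTORCH_MPS_PREFER_METAL is set or the inputs are integral;
+  // for complex inputs, also prefer them once the inner (K) dimension exceeds
+  // 2048, since MPSGraph's multiplicationWithPrimaryTensor: is known to return
+  // incorrect results past that size (see the issue referenced below); otherwise
+  // only fall back to Metal on pre-14.4 macOS when sizes or strides exceed 32768.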
constexpr auto max_stride_size = 32768; + constexpr auto max_complex_inner_size = 2048; static bool is_macos_14_4_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); if (always_use_metal || c10::isIntegralType(self.scalar_type(), true)) { return true; } + // multiplicationWithPrimaryTensor: returns incorrect results if inner size exceeds 2048 + // See https://github.com/pytorch/pytorch/issues/167727#issuecomment-3529308548 + if (c10::isComplexType(self.scalar_type()) && self.size(1) > max_complex_inner_size) { + return true; + } return !is_macos_14_4_or_newer && (self.stride(0) > max_stride_size || self.stride(1) > max_stride_size || self.size(0) > max_stride_size || self.size(1) > max_stride_size || other.stride(0) > max_stride_size || other.stride(1) > max_stride_size || diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm index c995b8fc237f3..f0bbcdabfa5cd 100644 --- a/aten/src/ATen/native/mps/operations/LossOps.mm +++ b/aten/src/ATen/native/mps/operations/LossOps.mm @@ -212,17 +212,12 @@ loss.resize_((reduction == Reduction::None || grad_output.defined()) ? target.sizes() : IntArrayRef({})); TORCH_CHECK(loss.is_mps()); - Tensor loss_squeezed = loss.squeeze(); - Tensor input_squeezed = input.squeeze(); - Tensor target_squeezed = target.squeeze(); - @autoreleasepool { - std::string key = - op_name + reductionToString(reduction) + getTensorsStringKey({input_squeezed, target_squeezed, weight}); + std::string key = op_name + reductionToString(reduction) + getTensorsStringKey({input, target, weight}); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_squeezed); - newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target_squeezed); + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target); MPSGraphTensor* bceLossUnweighted = nil; // if grad_output is defined, then it's a backward pass @@ -252,12 +247,12 @@ newCachedGraph->gradInputTensor = bceLoss; } } else { - newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input_squeezed.sizes().size()); + newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input.sizes().size()); } }); - Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_squeezed); - Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target_squeezed); - Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss_squeezed); + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target); + Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss); NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease]; diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm index 0c95fec667e80..7441692b6c291 100644 --- a/aten/src/ATen/native/mps/operations/Normalization.mm +++ b/aten/src/ATen/native/mps/operations/Normalization.mm @@ -923,7 +923,7 @@ Check if running mean exists (maybe do this check before making graph) MPSStream* stream = getCurrentMPSStream(); TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS"); @autoreleasepool { - mps::dispatch_sync_with_rethrow(stream->queue(), ^() { + 
dispatch_sync_with_rethrow(stream->queue(), ^() { // which kernel variant to use based on the normalized axis N size const int N_READS = 4; auto metalType = mps::scalarToMetalTypeString(input); diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm index 40afa15b4f700..f350b0137b05e 100644 --- a/aten/src/ATen/native/mps/operations/Repeat.mm +++ b/aten/src/ATen/native/mps/operations/Repeat.mm @@ -91,26 +91,31 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) { #include #endif -template -void computeRepeatIndices(const index_t* repeat_ptr, - const int64_t* cumsum_ptr, - index_t* result_ptr, - int64_t size, - int64_t result_size) { - id repeatBuffer = reinterpret_cast>(repeat_ptr); - id cumsumBuffer = reinterpret_cast>(cumsum_ptr); - id resultBuffer = reinterpret_cast>(result_ptr); - TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer); - +Tensor repeat_interleave_mps(const Tensor& repeat, std::optional output_size) { + TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); std::string scalar_type; - if constexpr (std::is_same_v) { + if (repeat.scalar_type() == kInt) { scalar_type = "int32_t"; - } else if constexpr (std::is_same_v) { + } else if (repeat.scalar_type() == kLong) { scalar_type = "int64_t"; } else { - TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type"); + TORCH_CHECK(false, "repeats has to be Long or Int tensor"); + } + if (repeat.size(0) == 0) { + return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + Tensor repeat_ = repeat.contiguous(); + Tensor cumsum = repeat.cumsum(0); + int64_t total = 0; + if (output_size.has_value()) { + total = output_size.value(); + } else { + total = cumsum[-1].item(); + TORCH_CHECK((repeat >= 0).all().item(), "repeats can not be negative"); } + auto result = at::empty({total}, repeat.options()); + MPSStream* mpsStream = getCurrentMPSStream(); dispatch_sync(mpsStream->queue(), ^() { @autoreleasepool { @@ -121,20 +126,13 @@ void computeRepeatIndices(const index_t* repeat_ptr, getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false); [computeEncoder setComputePipelineState:pipelineState]; - mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size); - mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size); + mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0)); + mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0)); getMPSProfiler().endProfileKernel(pipelineState); } }); -} - -Tensor repeat_interleave_mps(const Tensor& repeat, std::optional output_size) { - Tensor output; - AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() { - output = repeat_interleave_common>(repeat, output_size); - }); - return output; + return result; } } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm index 7b637d896f850..ed659bddd65cc 100644 --- a/aten/src/ATen/native/mps/operations/TensorCompare.mm +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -5,6 +5,7 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -89,13 +90,21 @@ static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor& auto clamp_shape = clamp_opt->sizes(); auto input_shape = input_t.sizes(); - TORCH_CHECK(num_clamp_dims <= num_input_dims, - op_name + ": clamp tensor number of dims must not be 
greater than that of input tensor") + if (num_clamp_dims > num_input_dims) { + auto leading_dims = num_clamp_dims - num_input_dims; + for (int64_t i = 0; i < leading_dims; ++i) { + TORCH_CHECK(clamp_shape[i] == 1, + op_name + ": clamp tensor leading shape must be 1 to broadcast with input tensor"); + } + } - for (int i = 0; i < num_clamp_dims; i++) + auto clamp_idx = num_clamp_dims - 1; + auto input_idx = num_input_dims - 1; + auto common_dims = std::min(num_clamp_dims, num_input_dims); + for (int64_t i = 0; i < common_dims; ++i) // One of the indices is allowed to be 1; will be handled by broadcast - TORCH_CHECK(clamp_shape[num_clamp_dims - 1 - i] == input_shape[num_input_dims - 1 - i] || - clamp_shape[num_clamp_dims - 1 - i] == 1 || input_shape[num_input_dims - 1 - i] == 1, + TORCH_CHECK(clamp_shape[clamp_idx - i] == input_shape[input_idx - i] || clamp_shape[clamp_idx - i] == 1 || + input_shape[input_idx - i] == 1, op_name + ": clamp tensor trailing shape must match input tensor") } } @@ -136,9 +145,6 @@ static void clamp_tensor_out_mps(const Tensor& input_t, auto result_type = output_t.scalar_type(); - IntArrayRef new_min_shape; - IntArrayRef new_max_shape; - auto num_min_dims = min_opt->dim(); auto num_max_dims = max_opt->dim(); auto num_input_dims = input_t.dim(); @@ -146,24 +152,32 @@ static void clamp_tensor_out_mps(const Tensor& input_t, std::vector new_min_arr(num_input_dims); std::vector new_max_arr(num_input_dims); - if (has_min && num_min_dims < num_input_dims) { - fill_new_shape(num_input_dims, num_min_dims, new_min_arr.data(), min_opt->sizes()); - new_min_shape = IntArrayRef(new_min_arr); - } - - if (has_max && num_max_dims < num_input_dims) { - fill_new_shape(num_input_dims, num_max_dims, new_max_arr.data(), max_opt->sizes()); - new_max_shape = IntArrayRef(new_max_arr); - } - Tensor min_opt_tensor; Tensor max_opt_tensor; + auto reshape_clamp_tensor = [&](const OptionalTensorRef clamp_tensor_ref, + int64_t num_clamp_dims, + std::vector& new_shape_storage) -> Tensor { + IntArrayRef clamp_shape = clamp_tensor_ref->sizes(); + bool requires_view = false; + + if (num_clamp_dims > num_input_dims) { + clamp_shape = clamp_shape.slice(num_clamp_dims - num_input_dims); + requires_view = true; + } else if (num_clamp_dims < num_input_dims) { + fill_new_shape(num_input_dims, num_clamp_dims, new_shape_storage.data(), clamp_shape); + clamp_shape = IntArrayRef(new_shape_storage); + requires_view = true; + } + + return requires_view ? (*clamp_tensor_ref).view(clamp_shape) : *clamp_tensor_ref; + }; + if (has_min) { - min_opt_tensor = (num_min_dims < num_input_dims) ? (*min_opt).view(new_min_shape) : *min_opt; + min_opt_tensor = reshape_clamp_tensor(min_opt, num_min_dims, new_min_arr); } if (has_max) { - max_opt_tensor = (num_max_dims < num_input_dims) ? (*max_opt).view(new_max_shape) : *max_opt; + max_opt_tensor = reshape_clamp_tensor(max_opt, num_max_dims, new_max_arr); } @autoreleasepool { @@ -244,8 +258,8 @@ static void clamp_scalar_out_mps(const Tensor& input_t, @autoreleasepool { // the optional min/max refs could affect how we build the cached graph - std::string key = op_name + (has_min ? ("_min:" + std::to_string(min_scalar)) : "") + - (has_max ? ("_max:" + std::to_string(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t}); + std::string key = op_name + (has_min ? ("_min:" + to_hex_key(min_scalar)) : "") + + (has_max ? 
("_max:" + to_hex_key(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t}); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { if (has_min) newCachedGraph->minTensor = [mpsGraph constantWithScalar:min_scalar diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 4424f51827d45..9a1c7c790afaa 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -192,6 +192,11 @@ CompositeExplicitAutograd: _assert_tensor_metadata Meta: _assert_tensor_metadata_meta_symint +- func: _async_error(str msg) -> () + dispatch: + CompositeExplicitAutograd: _async_error + Meta: _async_error_meta + - func: _print(str s) -> () dispatch: CompositeExplicitAutograd: _print @@ -2803,7 +2808,7 @@ - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA, MPS: floor_divide_out + CPU, CUDA, MPS, MTIA: floor_divide_out SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor @@ -4220,7 +4225,7 @@ MTIA: mm_out_mtia MPS: mm_out_mps XPU: mm_out_xpu - SparseCPU, SparseCUDA: _sparse_mm_out + SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out - func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor @@ -4292,6 +4297,7 @@ dispatch: SparseCPU: sparse_sparse_matmul_cpu SparseCUDA: sparse_sparse_matmul_cuda + SparseMPS: sparse_sparse_matmul_mps autogen: _sparse_sparse_matmul.out - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -4383,7 +4389,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: mv - SparseCPU, SparseCUDA: mv_sparse + SparseCPU, SparseCUDA, SparseMPS: mv_sparse - func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -7512,7 +7518,7 @@ - func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor variants: method dispatch: - SparseCPU, SparseCUDA: sparse_mask_projection + SparseCPU, SparseCUDA, SparseMPS: sparse_mask_projection autogen: _sparse_mask_projection.out - func: _to_cpu(Tensor[] tensors) -> Tensor[] @@ -9832,7 +9838,7 @@ structured_delegate: erfinv.out variants: method, function dispatch: - SparseCPU, SparseCUDA: erfinv_sparse + SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr tags: pointwise @@ -9841,7 +9847,7 @@ structured_delegate: erfinv.out variants: method dispatch: - SparseCPU, SparseCUDA: erfinv_sparse_ + SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_ tags: pointwise @@ -9851,7 +9857,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: erfinv_out - SparseCPU, SparseCUDA: erfinv_sparse_out + SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out tags: pointwise diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl index 180442b4b09a4..fecce634ec08c 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl @@ -1,7 +1,7 @@ load("//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library") load("//tools/build_defs:fb_xplat_cxx_test.bzl", "fb_xplat_cxx_test") load("//tools/build_defs:glob_defs.bzl", "subdir_glob") -load("//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "APPLETVOS", "CXX", "IOS", "MACOSX") +load("//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "CXX", "IOS", "MACOSX") # Shared by internal and OSS BUCK def define_qnnpack(third_party, labels = []): @@ -21,7 +21,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O2", "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", @@ -82,7 +82,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -129,7 +129,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -184,7 +184,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -236,7 +236,7 @@ def define_qnnpack(third_party, labels = []): ], ), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", ], @@ -291,7 +291,7 @@ def define_qnnpack(third_party, labels = []): ("src", "qnnpack/*.h"), ("include", "*.h"), ]), - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O2", "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", @@ -398,7 +398,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = 
(IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", @@ -465,7 +465,7 @@ def define_qnnpack(third_party, labels = []): ("src", "requantization/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", "-Wno-unused-command-line-argument", @@ -525,7 +525,7 @@ def define_qnnpack(third_party, labels = []): ("src", "qnnpack/*.h"), ]), header_namespace = "", - apple_sdks = (IOS, MACOSX, APPLETVOS), + apple_sdks = (IOS, MACOSX), compiler_flags = [ "-O3", "-ffast-math", diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h b/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h index 1a425146ad6c2..ac6370f8df29f 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h @@ -301,12 +301,12 @@ class AvgPoolMicrokernelTester { ASSERT_NEAR( float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; } } @@ -396,12 +396,12 @@ class AvgPoolMicrokernelTester { ASSERT_NEAR( float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; } } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h b/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h index e1583a2c058ef..fc94f9666d9d0 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h @@ -232,7 +232,7 @@ class MaxPoolMicrokernelTester { ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc(); } } diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu index d39e41c532553..7e3b502bf6f41 100644 --- a/aten/src/ATen/native/sparse/cuda/SoftMax.cu +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -30,10 +30,12 @@ #include #include +#include +#include +#include #include #include #include -#include #include #include @@ -47,6 +49,7 @@ #include #include #include +#include #include #include #include diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm 
b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm index 5dbee4e38af7b..3da1cb5da53c8 100644 --- a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm @@ -10,6 +10,10 @@ #include #else #include +#include +#include +#include +#include #include #include #include @@ -441,6 +445,33 @@ Tensor addmm_sparse_dense_mps( return out; } +static std::tuple mps_intersect_binary_search( + const Tensor& A_keys, + const Tensor& B_keys, + int64_t lenA, + int64_t lenB, + bool boolean_flag) { + + auto stream = getCurrentMPSStream(); + auto outA_idx = at::empty({lenA}, A_keys.options().dtype(at::kLong)); + auto outB_idx = at::empty({lenA}, A_keys.options().dtype(at::kLong)); + auto counter = at::zeros({1}, A_keys.options().dtype(at::kInt)); + + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pso = lib.getPipelineStateForFunc("intersect_binary_search"); + auto enc = stream->commandEncoder(); + [enc setComputePipelineState:pso]; + mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter, + static_cast(lenB), boolean_flag); + mtl_dispatch1DJob(enc, pso, static_cast(lenA)); + } + }); + + const auto match_count = static_cast(counter.item()); + return std::make_tuple(std::move(outA_idx), std::move(outB_idx), match_count); +} + SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) { TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device()); @@ -519,22 +550,10 @@ Tensor addmm_sparse_dense_mps( auto A_keys = A_is_lhs ? lhs_keys : rhs_keys; auto B_keys = A_is_lhs ? rhs_keys : lhs_keys; - auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong)); - auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong)); - auto counter = at::zeros({1}, at::device(device).dtype(kInt)); + auto [outA_idx, outB_idx, M_int64] = mps_intersect_binary_search( + A_keys, B_keys, lenA, lenB, A_is_lhs); - dispatch_sync_with_rethrow(stream->queue(), ^() { - @autoreleasepool { - auto pso = lib.getPipelineStateForFunc("intersect_binary_search"); - auto enc = stream->commandEncoder(); - [enc setComputePipelineState:pso]; - mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter, - static_cast(lenB), A_is_lhs); - mtl_dispatch1DJob(enc, pso, static_cast(lenA)); - } - }); - - const uint32_t M = counter.item(); // number of structural matches + const auto M = static_cast(M_int64); // number of structural matches r_.resize_as_(lhs); @@ -758,6 +777,14 @@ Tensor addmm_sparse_dense_mps( using OptTensor = std::optional; +static Tensor create_sparse_output_values( + const Tensor& template_values, + int64_t output_nnz, + ScalarType dtype) { + auto out_val_sizes = template_values.sizes().vec(); + out_val_sizes[0] = output_nnz; + return at::zeros(out_val_sizes, template_values.options().dtype(dtype)); +} static void sparse_mask_apply_out_mps_kernel( Tensor& result, @@ -779,9 +806,9 @@ static void sparse_mask_apply_out_mps_kernel( auto src = src_in.coalesce(); auto mask = coalesce_mask ? 
mask_in.coalesce() : mask_in; - const int64_t src_nnz = src._nnz(); - const int64_t mask_nnz = mask._nnz(); - const int64_t sd = src.sparse_dim(); + const auto src_nnz = src._nnz(); + const auto mask_nnz = mask._nnz(); + const auto sd = src.sparse_dim(); result.sparse_resize_(mask.sizes(), mask.sparse_dim(), mask.dense_dim()); auto commonDtype = at::result_type(src, mask); @@ -810,53 +837,27 @@ static void sparse_mask_apply_out_mps_kernel( return; } + auto mask_indices = mask._indices().contiguous(); + auto src_values = src._values().to(commonDtype).contiguous(); + auto out_values = create_sparse_output_values(src_values, mask_nnz, commonDtype); + if (src_nnz == 0) { - auto out_indices = mask._indices().contiguous(); - auto src_values = src._values().to(commonDtype); - auto out_val_sizes = src_values.sizes().vec(); - out_val_sizes[0] = mask_nnz; - auto out_values = at::zeros(out_val_sizes, src_values.options()); - alias_into_sparse(result, out_indices, out_values); + alias_into_sparse(result, mask_indices, out_values); result._coalesced_(mask.is_coalesced()); return; } - auto mask_indices = mask._indices().contiguous(); - auto src_indices = src._indices().contiguous(); - auto src_values = src._values().to(commonDtype).contiguous(); - - auto mask_keys = flatten_indices(mask_indices, mask.sizes().slice(0, sd)).contiguous(); - auto src_keys = flatten_indices(src_indices, src.sizes().slice(0, sd)).contiguous(); + auto mask_keys = flatten_indices(mask._indices().contiguous(), mask.sizes().slice(0, sd)).contiguous(); + auto src_keys = flatten_indices(src._indices().contiguous(), src.sizes().slice(0, sd)).contiguous(); - const bool A_is_src = (src_nnz <= mask_nnz); - const int64_t lenA = A_is_src ? src_nnz : mask_nnz; - const int64_t lenB = A_is_src ? mask_nnz : src_nnz; + const auto A_is_src = (src_nnz <= mask_nnz); + const auto lenA = A_is_src ? src_nnz : mask_nnz; + const auto lenB = A_is_src ? mask_nnz : src_nnz; auto A_keys = A_is_src ? src_keys : mask_keys; auto B_keys = A_is_src ? 
mask_keys : src_keys; - const auto device = result.device(); - auto stream = getCurrentMPSStream(); - - auto outA_idx = at::empty({lenA}, at::device(device).dtype(at::kLong)); - auto outB_idx = at::empty({lenA}, at::device(device).dtype(at::kLong)); - auto counter = at::zeros({1}, at::device(device).dtype(at::kInt)); - - dispatch_sync_with_rethrow(stream->queue(), ^() { - @autoreleasepool { - auto pso = lib.getPipelineStateForFunc("intersect_binary_search"); - auto enc = stream->commandEncoder(); - [enc setComputePipelineState:pso]; - mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter, - static_cast(lenB), A_is_src); - mtl_dispatch1DJob(enc, pso, static_cast(lenA)); - } - }); - - const int64_t M = static_cast(counter.item()); - - auto out_val_sizes = src_values.sizes().vec(); - out_val_sizes[0] = mask_nnz; - auto out_values = at::zeros(out_val_sizes, src_values.options()); + auto [outA_idx, outB_idx, M] = mps_intersect_binary_search( + A_keys, B_keys, lenA, lenB, A_is_src); if (M > 0) { auto src_match = outA_idx.narrow(0, 0, M); @@ -874,6 +875,70 @@ static void sparse_mask_apply_out_mps_kernel( result._coalesced_(mask.is_coalesced()); } +static void sparse_mask_projection_out_mps_kernel( + Tensor& result, + const Tensor& lhs, + const Tensor& rhs, + const OptTensor& /*x_hash_opt*/, + bool accumulate_matches) { + + TORCH_CHECK(lhs.is_sparse() && rhs.is_sparse(), "sparse_mask_projection: expected sparse COO"); + TORCH_CHECK(lhs.is_mps() && rhs.is_mps(), "sparse_mask_projection: expected MPS tensors"); + TORCH_CHECK(lhs.sparse_dim() == rhs.sparse_dim(), "sparse_dim mismatch"); + + auto lhs_c = lhs.coalesce(); + auto rhs_c = rhs.coalesce(); + + const auto sd = lhs_c.sparse_dim(); + const auto lhs_nnz = lhs_c._nnz(); + const auto rhs_nnz = rhs_c._nnz(); + + auto commonDtype = at::result_type(lhs_c, rhs_c); + TORCH_CHECK(canCast(commonDtype, result.scalar_type()), + "Can't convert ", commonDtype, " to output ", result.scalar_type()); + + result.sparse_resize_(lhs.sizes(), lhs.sparse_dim(), lhs.dense_dim()); + + auto lhs_indices = lhs_c._indices().contiguous(); + auto rhs_values = rhs_c._values().to(commonDtype).contiguous(); + auto out_values = create_sparse_output_values(rhs_values, lhs_nnz, commonDtype); + + if (lhs_nnz > 0 && rhs_nnz > 0) { + auto lhs_keys = flatten_indices(lhs_indices, lhs_c.sizes().slice(0, sd)).contiguous(); + auto rhs_keys = flatten_indices(rhs_c._indices().contiguous(), rhs_c.sizes().slice(0, sd)).contiguous(); + + const auto A_is_lhs = (lhs_nnz <= rhs_nnz); + const auto lenA = A_is_lhs ? lhs_nnz : rhs_nnz; + const auto lenB = A_is_lhs ? rhs_nnz : lhs_nnz; + auto A_keys = A_is_lhs ? lhs_keys : rhs_keys; + auto B_keys = A_is_lhs ? rhs_keys : lhs_keys; + + auto [outA_idx, outB_idx, M] = mps_intersect_binary_search( + A_keys, B_keys, lenA, lenB, A_is_lhs); + + if (M > 0) { + auto idx_in_A = outA_idx.narrow(0, 0, M); + auto idx_in_B = outB_idx.narrow(0, 0, M); + auto idx_in_lhs = A_is_lhs ? idx_in_A : idx_in_B; + auto idx_in_rhs = A_is_lhs ? 
idx_in_B : idx_in_A; + + const auto view_cols = rhs_values.numel() / std::max(rhs_nnz, 1); + auto rhs_rows = rhs_values.index_select(0, idx_in_rhs).contiguous(); + auto rhs_rows_2d = rhs_rows.view({M, view_cols}); + auto out_2d = out_values.view({lhs_nnz, view_cols}); + + if (accumulate_matches) { + out_2d.index_add_(0, idx_in_lhs, rhs_rows_2d); + } else { + out_2d.index_copy_(0, idx_in_lhs, rhs_rows_2d); + } + } + } + + alias_into_sparse(result, lhs._indices(), out_values); + result._coalesced_(lhs.is_coalesced()); +} + static void sparse_mask_intersection_out_mps_kernel( Tensor& result, const Tensor& lhs, @@ -888,5 +953,115 @@ static void sparse_mask_intersection_out_mps_kernel( /*coalesce_mask=*/false); } +Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) { + TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(), + "sparse_sparse_matmul_mps: both inputs must be sparse COO tensors"); + TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(), + "sparse_sparse_matmul_mps: both inputs must be on MPS device"); + TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2, + "sparse_sparse_matmul_mps: both inputs must be 2D matrices"); + TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0, + "sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)"); + TORCH_CHECK(mat1_.size(1) == mat2_.size(0), + "mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")"); + TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(), + "sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(), + " does not match mat2 dtype ", mat2_.scalar_type()); + + const auto device = mat1_.device(); + + auto A = mat1_.coalesce(); + auto B = mat2_.coalesce(); + + const auto I = A.size(0); + const auto K = A.size(1); + const auto N = B.size(1); + + const auto nnzA = A._nnz(); + const auto nnzB = B._nnz(); + + // Early empty result, return an empty, coalesced tensor + if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) { + auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong)); + auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type())); + auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options()); + out._coalesced_(true); + return out; + } + + const auto computeDtype = at::result_type(mat1_, mat2_); + + auto A_idx = A._indices().contiguous(); + auto A_val = A._values().to(computeDtype).contiguous(); + auto A_i = A_idx.select(0, 0).contiguous(); + auto A_k = A_idx.select(0, 1).contiguous(); + + auto B_idx = B._indices().contiguous(); + auto B_val = B._values().to(computeDtype).contiguous(); + auto B_k = B_idx.select(0, 0).contiguous(); + auto B_j = B_idx.select(0, 1).contiguous(); + + // csr-style row pointers for B by k (the shared dimension) + Tensor row_ptr_B; + { + auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong)); + row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong)); + build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B); + } + + auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K); + auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K); + auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo); + + auto counts = deg_B.index_select(0, A_k); + + const int64_t P = counts.sum().item(); + if (P == 0) { + auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong)); + auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type())); + auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, 
mat1_.options()); + out._coalesced_(true); + return out; + } + + auto group_ids = repeat_interleave_mps(counts); + + // exclusive cumsum of counts + auto offsets = cumsum(counts, /*dim=*/0).sub(counts); + auto offsets_gather = offsets.index_select(0, group_ids); + auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather); + + // Map each output element to its source B row and position + auto k_per_out = A_k.index_select(0, group_ids); + auto start_in_B = row_ptr_B.index_select(0, k_per_out); + auto seg_index = start_in_B.add(within); + + // Assemble candidate coo pairs and values + auto i_out = A_i.index_select(0, group_ids).contiguous(); + auto j_out = B_j.index_select(0, seg_index).contiguous(); + auto vA_out = A_val.index_select(0, group_ids).contiguous(); + auto vB_out = B_val.index_select(0, seg_index).contiguous(); + auto v_out = vA_out.mul(vB_out); + + // build (2, P) indices + auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous(); + out_indices.select(0, 0).copy_(i_out); + out_indices.select(0, 1).copy_(j_out); + + auto result = _sparse_coo_tensor_unsafe( + out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype)); + + result = result.coalesce(); + + if (result.scalar_type() != mat1_.scalar_type()) { + auto cast_vals = result._values().to(mat1_.scalar_type()); + auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options()); + out._coalesced_(true); + return out; + } + return result; +} + REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel); +REGISTER_MPS_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_mps_kernel); } // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp index 7fce73151b00f..a6742a7cb9e78 100644 --- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp @@ -478,7 +478,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) { const auto s_k = params.key.sym_size(2); const auto d_qk = params.query.sym_size(3); const auto d_v = params.value.sym_size(3); - long cudnn_version = at::detail::getCUDAHooks().versionCuDNN(); + long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN(); if (cudnn_version < 8903) { if (debug) { TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher"); @@ -709,7 +709,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { return false; #endif #if defined(CUDNN_VERSION) - static auto cudnn_version = cudnnGetVersion(); + static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN(); if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) { if (debug) { TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13)."); diff --git a/aten/src/ATen/native/utils/ParamUtils.h b/aten/src/ATen/native/utils/ParamUtils.h index c9088c03d81c1..8887664df1ce3 100644 --- a/aten/src/ATen/native/utils/ParamUtils.h +++ b/aten/src/ATen/native/utils/ParamUtils.h @@ -17,7 +17,7 @@ inline std::vector _expand_param_if_needed( std::ostringstream ss; ss << "expected " << param_name << " to be a single integer value or a " << "list of " << expected_dim << " values to match the convolution " - << "dimensions, but got " << param_name << "=" << list_param; + << "dimensions, but got " << param_name << '=' << list_param; 
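+      // The '=' above is streamed as a char literal rather than a one-character
+      // string, so the ostream inserts the character directly instead of first
+      // computing a C-string length; the same micro-optimization recurs for the
+      // other single-character literals elsewhere in this change.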
TORCH_CHECK(false, ss.str()); } else { return list_param.vec(); diff --git a/aten/src/ATen/native/vulkan/api/Adapter.cpp b/aten/src/ATen/native/vulkan/api/Adapter.cpp index 173479a0c2de0..350df39ea3684 100644 --- a/aten/src/ATen/native/vulkan/api/Adapter.cpp +++ b/aten/src/ATen/native/vulkan/api/Adapter.cpp @@ -358,9 +358,9 @@ std::string Adapter::stringize() const { std::string device_type = get_device_type_str(properties.deviceType); VkPhysicalDeviceLimits limits = properties.limits; - ss << "{" << std::endl; + ss << '{' << std::endl; ss << " Physical Device Info {" << std::endl; - ss << " apiVersion: " << v_major << "." << v_minor << std::endl; + ss << " apiVersion: " << v_major << '.' << v_minor << std::endl; ss << " driverversion: " << properties.driverVersion << std::endl; ss << " deviceType: " << device_type << std::endl; ss << " deviceName: " << properties.deviceName << std::endl; @@ -371,7 +371,7 @@ std::string Adapter::stringize() const { #define PRINT_LIMIT_PROP_VEC3(name) \ ss << " " << std::left << std::setw(36) << #name << limits.name[0] \ - << "," << limits.name[1] << "," << limits.name[2] << std::endl; + << ',' << limits.name[1] << ',' << limits.name[2] << std::endl; ss << " Physical Device Limits {" << std::endl; PRINT_LIMIT_PROP(maxImageDimension1D); @@ -425,7 +425,7 @@ std::string Adapter::stringize() const { ; } ss << " ]" << std::endl; - ss << "}"; + ss << '}'; return ss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/Exception.cpp b/aten/src/ATen/native/vulkan/api/Exception.cpp index 9b8b653e0619e..436b38cbba6c6 100644 --- a/aten/src/ATen/native/vulkan/api/Exception.cpp +++ b/aten/src/ATen/native/vulkan/api/Exception.cpp @@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) { VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED) VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL) default: - out << "VK_ERROR_UNKNOWN (VkResult " << result << ")"; + out << "VK_ERROR_UNKNOWN (VkResult " << result << ')'; break; } return out; @@ -46,7 +46,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) { // std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { - out << loc.function << " at " << loc.file << ":" << loc.line; + out << loc.function << " at " << loc.file << ':' << loc.line; return out; } @@ -66,7 +66,7 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg) : msg_(std::move(msg)), source_location_{source_location} { std::ostringstream oss; oss << "Exception raised from " << source_location_ << ": "; - oss << "(" << cond << ") is false! "; + oss << '(' << cond << ") is false! 
"; oss << msg_; what_ = oss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/QueryPool.cpp b/aten/src/ATen/native/vulkan/api/QueryPool.cpp index bfa92357daeed..63c163aa44aa9 100644 --- a/aten/src/ATen/native/vulkan/api/QueryPool.cpp +++ b/aten/src/ATen/native/vulkan/api/QueryPool.cpp @@ -173,8 +173,8 @@ void QueryPool::extract_results() { static std::string stringize(const VkExtent3D& extents) { std::stringstream ss; - ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth - << "}"; + ss << '{' << extents.width << ", " << extents.height << ", " << extents.depth + << '}'; return ss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp index cf8402e40a0b8..a7485b706c54e 100644 --- a/aten/src/ATen/native/vulkan/api/Runtime.cpp +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -149,7 +149,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( (void)flags; std::stringstream stream; - stream << layer_prefix << " " << message_code << " " << message << std::endl; + stream << layer_prefix << ' ' << message_code << ' ' << message << std::endl; const std::string log = stream.str(); std::cout << log; diff --git a/aten/src/ATen/native/vulkan/api/Utils.h b/aten/src/ATen/native/vulkan/api/Utils.h index 3172c9c461079..8cd6a74c1c467 100644 --- a/aten/src/ATen/native/vulkan/api/Utils.h +++ b/aten/src/ATen/native/vulkan/api/Utils.h @@ -253,7 +253,7 @@ using vec4 = vec<4u>; // uvec3 is the type representing tensor extents. Useful for debugging. inline std::ostream& operator<<(std::ostream& os, const uvec3& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')'; return os; } diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 81b3ce90b36bf..a522e7ab76cf4 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -61,6 +61,7 @@ list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 0937de4552821..33fe4121a040e 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -246,7 +246,7 @@ void TestToCFloat() { void TestToString() { Tensor b = ones({3, 7}) * .0000001f; std::stringstream s; - s << b << "\n"; + s << b << '\n'; std::string expect = "1e-07 *"; ASSERT_EQ_RESOLVED(s.str().substr(0, expect.size()), expect); } diff --git a/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp new file mode 100644 index 0000000000000..535bb3d1cc2ea --- /dev/null +++ b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp @@ -0,0 +1,77 @@ +#include + +#include +#include +#include + +#include +#include +#include + +// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace +// to verify that the data race fix is working correctly + +TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) { + if (!at::cuda::is_available()) { + return; + } + + constexpr int num_accessor_threads = 15; + constexpr int num_clear_threads = 5; + constexpr int 
iterations_per_thread = 50; + + std::atomic stop{false}; + std::atomic error_count{0}; + std::vector threads; + threads.reserve(num_accessor_threads + num_clear_threads); + + // Launch accessor threads + for (int i = 0; i < num_accessor_threads; ++i) { + threads.emplace_back([&stop, &error_count]() { + try { + at::cuda::CUDAGuard device_guard(0); + + while (!stop.load(std::memory_order_relaxed)) { + const auto handle = at::cuda::getCurrentCUDABlasHandle(); + const auto workspace = at::cuda::getCUDABlasLtWorkspace(); + + if (handle == nullptr || workspace == nullptr) { + error_count++; + } + } + } catch (const std::exception& e) { + error_count++; + } + }); + } + + // Launch threads that clear workspaces + for (int i = 0; i < num_clear_threads; ++i) { + threads.emplace_back([&error_count]() { + try { + for (int j = 0; j < iterations_per_thread; ++j) { + at::cuda::clearCublasWorkspaces(); + std::this_thread::yield(); + } + } catch (const std::exception& e) { + error_count++; + } + }); + } + + // Let them run for a bit + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + stop.store(true, std::memory_order_relaxed); + + for (auto& thread : threads) { + thread.join(); + } + + EXPECT_EQ(error_count.load(), 0); +} + +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + c10::cuda::CUDACachingAllocator::init(1); + return RUN_ALL_TESTS(); +} diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 0d7b62b44d214..a22fb0d16adf8 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -33,7 +33,7 @@ struct Foo { static void apply(Tensor a, Tensor b) { scalar_type s = 1; std::stringstream ss; - ss << "hello, dispatch: " << a.toString() << s << "\n"; + ss << "hello, dispatch: " << a.toString() << s << '\n'; auto data = (scalar_type*)a.data_ptr(); (void)data; } @@ -73,8 +73,8 @@ TEST(TestScalar, TestScalar) { Scalar bar = 3.0; Half h = bar.toHalf(); Scalar h2 = h; - cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " - << bar.toDouble() << " " << what.isIntegral(false) << "\n"; + cout << "H2: " << h2.toDouble() << ' ' << what.toFloat() << ' ' + << bar.toDouble() << ' ' << what.isIntegral(false) << '\n'; auto gen = at::detail::getDefaultCPUGenerator(); { // See Note [Acquire lock when using random generators] @@ -84,7 +84,7 @@ TEST(TestScalar, TestScalar) { } if (at::hasCUDA()) { auto t2 = zeros({4, 4}, at::kCUDA); - cout << &t2 << "\n"; + cout << &t2 << '\n'; } auto t = ones({4, 4}); @@ -129,7 +129,7 @@ TEST(TestScalar, TestScalar) { std::stringstream ss; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) ASSERT_NO_THROW( - ss << "hello, dispatch" << x.toString() << s << "\n"); + ss << "hello, dispatch" << x.toString() << s << '\n'); auto data = (scalar_t*)x.data_ptr(); (void)data; }); diff --git a/aten/src/ATen/test/test_install/main.cpp b/aten/src/ATen/test/test_install/main.cpp index e9a03d2303a39..3a57e0c6212bf 100644 --- a/aten/src/ATen/test/test_install/main.cpp +++ b/aten/src/ATen/test/test_install/main.cpp @@ -1,5 +1,5 @@ #include int main() { - std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << "\n"; + std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << '\n'; } diff --git a/aten/src/ATen/test/vec_test_all_types.cpp b/aten/src/ATen/test/vec_test_all_types.cpp index da0da76109569..c0c05c1484175 100644 --- a/aten/src/ATen/test/vec_test_all_types.cpp +++ b/aten/src/ATen/test/vec_test_all_types.cpp @@ -1828,9 +1828,9 @@ namespace { #endif EXPECT_EQ(u16, 
c10::detail::fp16_ieee_from_fp32_value(f32s[i])) - << "Test failed for float to uint16 " << f32s[i] << "\n"; + << "Test failed for float to uint16 " << f32s[i] << '\n'; EXPECT_EQ(x, c10::detail::fp16_ieee_to_fp32_value(u16)) - << "Test failed for uint16 to float " << u16 << "\n"; + << "Test failed for uint16 to float " << u16 << '\n'; } } TEST(FP8E4M3Test, FP8E4M3ConversionFloat) { @@ -1848,10 +1848,10 @@ namespace { EXPECT_TRUE(std::isnan(f32)); } else { EXPECT_EQ(f32, c10::detail::fp8e4m3fn_to_fp32_value(input)) - << "Test failed for u8 to float " << input << "\n"; + << "Test failed for u8 to float " << input << '\n'; } EXPECT_EQ(u8, c10::detail::fp8e4m3fn_from_fp32_value(f32)) - << "Test failed for float to u8 " << f32 << "\n"; + << "Test failed for float to u8 " << f32 << '\n'; } } TEST(FP8E4M3Test, FP8E4M3BinaryAdd) { @@ -2015,10 +2015,10 @@ namespace { EXPECT_TRUE(std::isnan(f32)); } else { EXPECT_EQ(f32, c10::detail::fp8e5m2_to_fp32_value(input)) - << "Test failed for u8 to float " << input << "\n"; + << "Test failed for u8 to float " << input << '\n'; } EXPECT_EQ(u8, c10::detail::fp8e5m2_from_fp32_value(f32)) - << "Test failed for float to u8 " << f32 << "\n"; + << "Test failed for float to u8 " << f32 << '\n'; } } TEST(FP8E5M2Test, FP8E5M2BinaryAdd) { diff --git a/aten/src/ATen/test/vitals.cpp b/aten/src/ATen/test/vitals.cpp index cc93775bb5383..eaf1cc152bc37 100644 --- a/aten/src/ATen/test/vitals.cpp +++ b/aten/src/ATen/test/vitals.cpp @@ -19,7 +19,7 @@ TEST(Vitals, Basic) { c10::utils::set_env("TORCH_VITAL", "1"); TORCH_VITAL_DEFINE(Testing); TORCH_VITAL(Testing, Attribute0) << 1; - TORCH_VITAL(Testing, Attribute1) << "1"; + TORCH_VITAL(Testing, Attribute1) << '1'; TORCH_VITAL(Testing, Attribute2) << 1.0f; TORCH_VITAL(Testing, Attribute3) << 1.0; auto t = at::ones({1, 1}); diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 396ea59d2f008..29f01fbd78c51 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -129,14 +129,14 @@ void showRtol(const at::Tensor& a, const at::Tensor& b) { std::cout << "Max Diff allowed: " << maxDiff << std::endl; if (diff.sizes().size() == 2) { for (const auto y : c10::irange(diff.sizes()[0])) { - std::cout << y << ":"; + std::cout << y << ':'; for (const auto x : c10::irange(diff.sizes()[1])) { float diff_xy = diff[y][x].item(); if (diff_xy > maxDiff) { std::cout << std::setw(5) << x; } else { - std::cout << std::setw(5) << " "; + std::cout << std::setw(5) << ' '; } } std::cout << std::endl; @@ -3276,7 +3276,7 @@ TEST_F(VulkanAPITest, masked_fill_invalidinputs_exceptions) { void print_shape(const std::vector& shape) { for (const auto& num : shape) { - std::cout << num << " "; + std::cout << num << ' '; } } @@ -3367,7 +3367,7 @@ void test_masked_fill_scalar( print_shape(tmp_curr_input_shape); std::cout << "], and mask of shape ["; print_shape(tmp_curr_mask_shape); - std::cout << "]" << std::endl; + std::cout << ']' << std::endl; } ASSERT_TRUE(check); @@ -4542,9 +4542,9 @@ void test_softmax(const at::IntArrayRef shape, bool log_softmax = false) { if (!check) { std::cout << "Softmax test failed on axis " << dim << "for tensor dims {"; for (uint32_t place = 0; place < shape.size() - 1; place++) { - std::cout << shape[place] << " "; + std::cout << shape[place] << ' '; } - std::cout << shape.back() << "}" << std::endl; + std::cout << shape.back() << '}' << std::endl; showRtol(out_cpu, out_vulkan.cpu()); } ASSERT_TRUE(check); diff --git 
a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp index 2829aed94def9..2eff421a64ced 100644 --- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp +++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp @@ -95,7 +95,7 @@ void showRtol( std::cout << "Max Diff found is: " << diff.max().item() << std::endl; if (diff.sizes().size() == 2) { for (const auto y : c10::irange(diff.sizes()[0])) { - std::cout << y << ":"; + std::cout << y << ':'; for (const auto x : c10::irange(diff.sizes()[1])) { double diff_xy = diff[y][x].item(); if (diff_xy > maxDiff) { @@ -109,7 +109,7 @@ void showRtol( } } } else { - std::cout << std::setw(5) << " "; + std::cout << std::setw(5) << ' '; } } std::cout << std::endl; @@ -148,19 +148,19 @@ using at::native::vulkan::api::utils::ivec4; using at::native::vulkan::api::utils::vec4; std::ostream& operator<<(std::ostream& os, const vec4& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " + << v.data[3u] << ')'; return os; } std::ostream& operator<<(std::ostream& os, const ivec3& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')'; return os; } std::ostream& operator<<(std::ostream& os, const ivec4& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " + << v.data[3u] << ')'; return os; } @@ -3379,51 +3379,51 @@ bool _test_quantized_linear( showRtol(out_cpu_dequant, out_vk_to_cpu_dequant); } if (xpos != -1 && ypos != -1) { - std::cout << "\nFailure caused on row/col: " << ypos << "/" << xpos - << "\n"; + std::cout << "\nFailure caused on row/col: " << ypos << '/' << xpos + << '\n'; std::cout << "Input tensor scale: " << scale << " zerop: " << zero_point - << "\n"; - std::cout << "Input tensor row " << ypos << "\n"; + << '\n'; + std::cout << "Input tensor row " << ypos << '\n'; for (int i = 0; i < input_cpu.sizes()[1]; i++) { std::cout << input_cpu[ypos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "Weight tensor scale: " << w_scale - << " zerop: " << w_zero_point << "\n"; - std::cout << "Weight tensor col " << xpos << "\n"; + << " zerop: " << w_zero_point << '\n'; + std::cout << "Weight tensor col " << xpos << '\n'; for (int i = 0; i < weight.sizes()[1]; i++) { std::cout << weight[xpos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "Input tensor quantized row " << ypos << " with dtype " - << (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n"; + << (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n'; for (int i = 0; i < input_cpu.sizes()[1]; i++) { std::cout << input_cpu_quantized[ypos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "Weight tensor quantized col " << xpos << " with dtype " - << (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n"; + << (weight_quant_dtype_int8 ? 
"QInt8" : "QUInt8") << '\n'; for (int i = 0; i < weight.sizes()[1]; i++) { std::cout << weight_cpu_quantized[xpos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "bias tensor\n"; for (int i = 0; i < bias.sizes()[0]; i++) { std::cout << bias[i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "out_scale: " << out_scale - << " out_zero_point: " << out_zero_point << "\n"; + << " out_zero_point: " << out_zero_point << '\n'; std::cout << "cpu unmatched output: " - << out_cpu_dequant[ypos][xpos].item() << "\n"; + << out_cpu_dequant[ypos][xpos].item() << '\n'; std::cout << "vk unmatched output: " - << out_vk_to_cpu_dequant[ypos][xpos].item() << "\n"; + << out_vk_to_cpu_dequant[ypos][xpos].item() << '\n'; } } return check; diff --git a/aten/src/ATen/xpu/XPUEvent.h b/aten/src/ATen/xpu/XPUEvent.h index 19d42aae080f1..f33fd70ac0619 100644 --- a/aten/src/ATen/xpu/XPUEvent.h +++ b/aten/src/ATen/xpu/XPUEvent.h @@ -1,191 +1,3 @@ #pragma once #include - -#include - -namespace at::xpu { - -/* - * XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are - * constructed lazily when first recorded. It has a device, and this device is - * acquired from the first recording stream. Later streams that record the event - * must match the same device. - * - * Currently, XPUEvent does NOT support to export an inter-process event from - * another process via inter-process communication(IPC). So it means that - * inter-process communication for event handles between different processes is - * not available. This could impact some applications that rely on cross-process - * synchronization and communication. - */ -struct TORCH_XPU_API XPUEvent { - // Constructors - XPUEvent(bool enable_timing = false) noexcept - : enable_timing_{enable_timing} {} - - ~XPUEvent() { - if (isCreated()) { - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_deletion( - at::kXPU, reinterpret_cast(event_.get())); - } - } - } - - XPUEvent(const XPUEvent&) = delete; - XPUEvent& operator=(const XPUEvent&) = delete; - - XPUEvent(XPUEvent&& other) = default; - XPUEvent& operator=(XPUEvent&& other) = default; - - operator sycl::event&() const { - return event(); - } - - std::optional device() const { - if (isCreated()) { - return at::Device(at::kXPU, device_index_); - } else { - return std::nullopt; - } - } - - inline bool isCreated() const { - return (event_.get() != nullptr); - } - - DeviceIndex device_index() const { - return device_index_; - } - - sycl::event& event() const { - return *event_; - } - - bool query() const { - using namespace sycl::info; - if (!isCreated()) { - return true; - } - - return event().get_info() == - event_command_status::complete; - } - - void record() { - record(getCurrentXPUStream()); - } - - void recordOnce(const XPUStream& stream) { - if (!isCreated()) { - record(stream); - } - } - - void record(const XPUStream& stream) { - if (!isCreated()) { - device_index_ = stream.device_index(); - assignEvent(stream.queue()); - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_creation( - at::kXPU, reinterpret_cast(event_.get())); - } - } else { - TORCH_CHECK( - device_index_ == stream.device_index(), - "Event device ", - device_index_, - " does not match recording stream's device ", - stream.device_index(), - "."); - reassignEvent(stream.queue()); - } - const c10::impl::PyInterpreter* 
interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_record( - at::kXPU, - reinterpret_cast(event_.get()), - reinterpret_cast(&stream.queue())); - } - } - - void block(const XPUStream& stream) { - if (isCreated()) { - std::vector event_list{event()}; - // Make this stream wait until event_ is completed. - stream.queue().ext_oneapi_submit_barrier(event_list); - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_wait( - at::kXPU, - reinterpret_cast(event_.get()), - reinterpret_cast(&stream.queue())); - } - } - } - - double elapsed_time(const XPUEvent& other) const { - TORCH_CHECK( - isCreated() && other.isCreated(), - "Both events must be recorded before calculating elapsed time."); - TORCH_CHECK( - query() && other.query(), - "Both events must be completed before calculating elapsed time."); - TORCH_CHECK( - enable_timing_ && other.enable_timing_, - "Both events must be created with argument 'enable_timing=True'."); - -#if SYCL_COMPILER_VERSION < 20250000 - TORCH_CHECK_NOT_IMPLEMENTED( - false, - "elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer."); -#endif - - using namespace sycl::info::event_profiling; - // Block until both of the recorded events are completed. - uint64_t end_time_ns = other.event().get_profiling_info(); - uint64_t start_time_ns = event().get_profiling_info(); - // Return the eplased time in milliseconds. - return 1e-6 * - (static_cast(end_time_ns) - static_cast(start_time_ns)); - } - - void synchronize() const { - if (isCreated()) { - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_synchronization( - at::kXPU, reinterpret_cast(event_.get())); - } - event().wait_and_throw(); - } - } - - private: - void assignEvent(sycl::queue& queue) { -#if SYCL_COMPILER_VERSION >= 20250000 - if (enable_timing_) { - event_ = std::make_unique( - sycl::ext::oneapi::experimental::submit_profiling_tag(queue)); - } else { - event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); - } -#else - event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); -#endif - } - - void reassignEvent(sycl::queue& queue) { - event_.reset(); - assignEvent(queue); - } - - bool enable_timing_ = false; - DeviceIndex device_index_ = -1; - // Only need to track the last event, as events in an in-order queue are - // executed sequentially. 
- std::unique_ptr event_; -}; - -} // namespace at::xpu +#include diff --git a/aten/src/ATen/xpu/XPUScaledBlas.cpp b/aten/src/ATen/xpu/XPUScaledBlas.cpp new file mode 100644 index 0000000000000..ea7e043da40ec --- /dev/null +++ b/aten/src/ATen/xpu/XPUScaledBlas.cpp @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +using at::blas::ScalingType; + +namespace at::native::onednn::scaled { + +/** + * Both inputs must be fp8, + * Each needs a single scale, {Tensorwise (float)} + */ +bool check_tensorwise_recipe( + c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) { + return false; + } + + // 1 scale each, {Tensorwise, float} + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || + recipe_b.size() != 1) { + return false; + } + // Need {Blockwise_1x32, e8m0} for A & B + if (recipe_a[0] != ScalingType::TensorWise) + return false; + if (scales_a[0].scalar_type() != ScalarType::Float) + return false; + if (recipe_b[0] != ScalingType::TensorWise) + return false; + if (scales_b[0].scalar_type() != ScalarType::Float) + return false; + + return true; +} + +/** + * Both inputs must be fp8, + * Each needs scales, {Rowwise (float)} + */ +bool check_rowwise_recipe( + c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) { + return false; + } + + // 1 scale each, {Tensorwise, float} + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || + recipe_b.size() != 1) { + return false; + } + + // Need {RowWise, dp32} for A & B + if (recipe_a[0] != ScalingType::RowWise) + return false; + if (scales_a[0].scalar_type() != ScalarType::Float) + return false; + if (recipe_b[0] != ScalingType::RowWise) + return false; + if (scales_b[0].scalar_type() != ScalarType::Float) + return false; + + return true; +} + +} // namespace at::native::onednn::scaled diff --git a/aten/src/ATen/xpu/XPUScaledBlas.h b/aten/src/ATen/xpu/XPUScaledBlas.h new file mode 100644 index 0000000000000..2940dbfc56dfe --- /dev/null +++ b/aten/src/ATen/xpu/XPUScaledBlas.h @@ -0,0 +1,95 @@ +#include +#include +#include +#include +#include +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef USE_FBGEMM_GENAI +#include +#endif + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +using at::blas::ScalingType; + +namespace at::native::onednn::scaled { + +/** + * Track concrete implementations available + */ +enum class ScaledGemmImplementation { + NONE = 0, + 
TENSORWISE_TENSORWISE = 1, + ROWWISE_ROWWISE = 2, +}; + +/** + * Convert passed int (enum) from python back into a + * strictly-typed enum + */ +template +std::vector convert_int_to_enum(ArrayType& v) { + std::vector converted; + converted.reserve(v.size()); + + for (auto vi : v) { + converted.push_back(static_cast(vi)); + } + return converted; +} + +bool check_tensorwise_recipe( + c10::ScalarType, + std::vector&, + ArrayRef&, + c10::ScalarType, + std::vector&, + ArrayRef&); + +bool check_rowwise_recipe( + c10::ScalarType, + std::vector&, + ArrayRef&, + c10::ScalarType, + std::vector&, + ArrayRef&); + +} // namespace at::native::onednn::scaled diff --git a/aten/tools/valgrind.sup b/aten/tools/valgrind.sup index ad5f66e0b0531..585487c4d2be2 100644 --- a/aten/tools/valgrind.sup +++ b/aten/tools/valgrind.sup @@ -10,6 +10,13 @@ ... } +{ + ignore_empty_generic_uninitialised_conditional_jump + Memcheck:Cond + fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE + ... +} + { Cond_cuda Memcheck:Cond diff --git a/benchmarks/dynamo/check_accuracy.py b/benchmarks/dynamo/check_accuracy.py index 83cca8b36b993..7f8be84b93fd7 100644 --- a/benchmarks/dynamo/check_accuracy.py +++ b/benchmarks/dynamo/check_accuracy.py @@ -50,6 +50,7 @@ def check_accuracy(actual_csv, expected_csv, expected_filename): "mobilenet_v2", "pytorch_CycleGAN_and_pix2pix", "pytorch_stargan", + "repvgg_a2", "resnet152", "resnet18", "resnet50", diff --git a/benchmarks/dynamo/check_perf_csv.py b/benchmarks/dynamo/check_perf_csv.py index 320a4544f829b..08070dda4444c 100644 --- a/benchmarks/dynamo/check_perf_csv.py +++ b/benchmarks/dynamo/check_perf_csv.py @@ -9,28 +9,61 @@ def check_perf_csv(filename, threshold, threshold_scale): """ Basic performance checking. 
""" + try: + df = pd.read_csv(filename) + except FileNotFoundError: + print(f"Error: File {filename} not found") + sys.exit(1) - df = pd.read_csv(filename) + effective_threshold = threshold * threshold_scale + print(f"Checking {filename} (speedup threshold >= {effective_threshold:.2f}x)\n") failed = [] for _, row in df.iterrows(): model_name = row["name"] - speedup = row["speedup"] - if speedup < threshold * threshold_scale: - failed.append(model_name) + speedup = float(row["speedup"]) + abs_latency = float(row["abs_latency"]) + compilation_latency = float(row["compilation_latency"]) + compression_ratio = float(row["compression_ratio"]) + eager_peak_mem = float(row["eager_peak_mem"]) + dynamo_peak_mem = float(row["dynamo_peak_mem"]) + + perf_summary = f"{model_name:34} speedup={speedup:.3f}x" + if pd.notna(abs_latency): + perf_summary += f", latency={abs_latency:.1f} ms/iter" + if pd.notna(compilation_latency): + perf_summary += f", compile={compilation_latency:.3f}s" + if pd.notna(compression_ratio): + perf_summary += f", mem_ratio={1 / compression_ratio:.2f}x" + if pd.notna(eager_peak_mem) and pd.notna(dynamo_peak_mem): + perf_summary += ( + f" (eager={eager_peak_mem:.1f} GB, dynamo={dynamo_peak_mem:.1f} GB)" + ) + + if speedup < effective_threshold: + failed.append((model_name, speedup)) - print(f"{model_name:34} {speedup}") + print(perf_summary) if failed: print( textwrap.dedent( f""" - Error {len(failed)} models performance regressed - {" ".join(failed)} + Error {len(failed)} model(s) performance regressed + {" ".join([name for name, _ in failed])} """ ) ) + for name, sp in sorted(failed, key=lambda x: x[1]): + pct_from_target = (sp / effective_threshold - 1.0) * 100.0 + print( + f" - {name}: {sp:.3f}x (< {effective_threshold:.2f}x; {pct_from_target:.1f}% from target)" + ) sys.exit(1) + else: + print( + f"\nAll {len(df)} model(s) passed threshold check (>= {effective_threshold:.2f}x)" + ) if __name__ == "__main__": @@ -44,7 +77,7 @@ def check_perf_csv(filename, threshold, threshold_scale): "-s", type=float, default=1.0, - help="multiple threshold by this value to relax the check", + help="multiply threshold by this value to relax the check", ) args = parser.parse_args() check_perf_csv(args.file, args.threshold, args.threshold_scale) diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv index b5e457e58997d..b2f40504a4991 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv @@ -10,7 +10,7 @@ beit_base_patch16_224,pass,7 -convnextv2_nano.fcmae_ft_in22k_in1k,pass,7 +convnextv2_nano.fcmae_ft_in22k_in1k,fail_accuracy,7 @@ -66,7 +66,7 @@ visformer_small,pass,7 -vit_base_patch14_dinov2.lvd142m,pass,7 +vit_base_patch14_dinov2.lvd142m,fail_accuracy,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv index b2071874b70d6..2d087e6595526 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv @@ -50,7 +50,7 @@ nfnet_l0,pass,7 -repvgg_a2,fail_accuracy,7 +repvgg_a2,pass,7 diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index e0681f52586e7..b3484e7196a83 100644 --- a/benchmarks/dynamo/common.py +++ 
b/benchmarks/dynamo/common.py @@ -952,7 +952,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs): first_fields.append(kwargs["tag"]) headers = first_headers + ["speedup", "abs_latency"] row = first_fields + [float(speedup), median[1] * 1000] - msg = f"{speedup:.3f}x" + msg = f"{median[0] * 1000} ms, {median[1] * 1000} ms, {speedup:.3f}x" if args.baseline: headers.extend( [ @@ -1010,7 +1010,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs): # Hypothetically you can use this from other places, but it's currently # inaccessible, and when this assert fails you need to update the # event_name here to account for the other cases you are using this - assert args.quantization is not None + assert any([args.quantization, args.optimus]) output_signpost( dict(zip(headers, row)), args, @@ -2288,11 +2288,9 @@ def record_status(accuracy_status, dynamo_start_stats): ) ): is_same = False - except Exception as e: + except Exception: # Sometimes torch.allclose may throw RuntimeError - exception_string = str(e) - accuracy_status = f"fail_exception: {exception_string}" - return record_status(accuracy_status, dynamo_start_stats=start_stats) + is_same = False if not is_same: accuracy_status = "eager_two_runs_differ" @@ -2381,7 +2379,9 @@ def record_status(accuracy_status, dynamo_start_stats): print( f"Load model outputs from {self.args.compare_model_outputs_with} to compare" ) - saved_result = torch.load(self.args.compare_model_outputs_with) + saved_result = torch.load( + self.args.compare_model_outputs_with, weights_only=False + ) is_bitwise_same = bitwise_same(saved_result, new_result) if not is_bitwise_same: print( @@ -2409,11 +2409,9 @@ def record_status(accuracy_status, dynamo_start_stats): force_max_multiplier=force_max_multiplier, ): is_same = False - except Exception as e: + except Exception: # Sometimes torch.allclose may throw RuntimeError - exception_string = str(e) - accuracy_status = f"fail_exception: {exception_string}" - return record_status(accuracy_status, dynamo_start_stats=start_stats) + is_same = False if not is_same: if self.args.skip_accuracy_check: @@ -2587,6 +2585,9 @@ def warmup(fn, model, example_inputs, mode, niters=10): **experiment_kwargs, ) + # reset dynamo + torch._dynamo.reset() + if self.args.export_aot_inductor: optimized_model_iter_fn = optimize_ctx else: @@ -2950,7 +2951,7 @@ def run_one_model( status = self.check_tolerance(name, model, example_inputs, optimize_ctx) print(status) elif self.args.performance: - if self.args.backend == "torchao": + if self.args.backend in ["torchao", "optimus"]: status = self.run_performance_test_non_alternate( name, model, example_inputs, optimize_ctx, experiment, tag ) @@ -3526,6 +3527,12 @@ def get_example_inputs(self): action="store_true", help="Measure speedup with TorchInductor", ) + group.add_argument( + "--optimus", + choices=["vertical_opt", "horizontal_opt", "all"], + default=None, + help="Measure speedup of Optimus with TorchInductor baseline", + ) group.add_argument( "--quantization", choices=[ @@ -3783,6 +3790,9 @@ def run(runner, args, original_dir=None): if args.inductor: assert args.backend is None args.backend = "inductor" + if args.optimus: + assert args.backend is None + args.backend = "optimus" if args.quantization: assert args.backend is None args.backend = "torchao" @@ -4067,10 +4077,22 @@ def model_iter_fn_and_mark_step(*args, **kwargs): runner.model_iter_fn = model_iter_fn_and_mark_step optimize_ctx = torchao_optimize_ctx(args.quantization) + elif args.backend == 
"optimus": + from .optimus import get_baseline_ctx, get_optimus_optimize_ctx + + baseline_ctx = get_baseline_ctx( + nopython=args.nopython, inductor_compile_mode=args.inductor_compile_mode + ) + runner.model_iter_fn = baseline_ctx(runner.model_iter_fn) + optimize_ctx = get_optimus_optimize_ctx( + args.optimus, args.nopython, args.inductor_compile_mode + ) else: optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython) experiment = ( - speedup_experiment if args.backend != "torchao" else latency_experiment + speedup_experiment + if args.backend not in ["torchao", "optimus"] + else latency_experiment ) if args.accuracy: output_filename = f"accuracy_{args.backend}.csv" @@ -4091,7 +4113,12 @@ def model_iter_fn_and_mark_step(*args, **kwargs): if args.only in runner.disable_cudagraph_models: args.disable_cudagraphs = True - if args.inductor or args.backend == "inductor" or args.export_aot_inductor: + if ( + args.inductor + or args.backend == "inductor" + or args.export_aot_inductor + or args.backend == "optimus" + ): inductor_config.triton.cudagraphs = not args.disable_cudagraphs inductor_config.triton.persistent_reductions = ( not args.disable_persistent_reductions diff --git a/benchmarks/dynamo/optimus.py b/benchmarks/dynamo/optimus.py new file mode 100644 index 0000000000000..f188b698edd5f --- /dev/null +++ b/benchmarks/dynamo/optimus.py @@ -0,0 +1,62 @@ +import functools + +import torch + + +def get_baseline_ctx(nopython, inductor_compile_mode): + return functools.partial( + torch.compile, + backend="inductor", + fullgraph=nopython, + mode=inductor_compile_mode, + ) + + +def get_optimus_optimize_ctx(config, nopython, inductor_compile_mode): + if config == "vertical_opt": + optimus_inductor_config = { + "pre_grad_fusion_options": { + "normalization_pass": {}, + "merge_splits_pass": {}, + "split_cat_pass": {}, + "unbind_stack_pass": {}, + "unbind_cat_to_view_pass": {}, + } + } + elif config == "horizontal_opt": + optimus_inductor_config = { + "pre_grad_fusion_options": { + "normalization_pass": {}, + "batch_linear": {}, + "batch_layernorm": {}, + }, + } + elif config == "all": + optimus_inductor_config = { + "pre_grad_fusion_options": { + "normalization_pass": {}, + "batch_linear": {}, + "batch_layernorm": {}, + "merge_splits_pass": {}, + "split_cat_pass": {}, + "unbind_stack_pass": {}, + "unbind_cat_to_view_pass": {}, + }, + } + else: + raise RuntimeError(f"Unknown optimus config: {config}") + + def _inner(fn): + if "pre_grad_fusion_options" in optimus_inductor_config: + torch._inductor.config.pre_grad_fusion_options = optimus_inductor_config[ + "pre_grad_fusion_options" + ] + if "post_grad_fusion_options" in optimus_inductor_config: + torch._inductor.config.post_grad_fusion_options = optimus_inductor_config[ + "post_grad_fusion_options" + ] + return torch.compile( + fn, backend="inductor", fullgraph=nopython, mode=inductor_compile_mode + ) + + return _inner diff --git a/benchmarks/dynamo/parse_logs.py b/benchmarks/dynamo/parse_logs.py index 8704fda9b997a..a3def611bbcc2 100644 --- a/benchmarks/dynamo/parse_logs.py +++ b/benchmarks/dynamo/parse_logs.py @@ -2,6 +2,7 @@ import os import re import sys +from pathlib import Path # This script takes the logs produced by the benchmark scripts (e.g., @@ -15,8 +16,7 @@ # This script is not very well written, feel free to rewrite it as necessary assert len(sys.argv) == 2 - -full_log = open(sys.argv[1]).read() +full_log = Path(sys.argv[1]).read_text() # If the log contains a gist URL, extract it so we can include it in the CSV gist_url 
= "" diff --git a/benchmarks/dynamo/pr_time_benchmarks/benchmarks/dtensor.py b/benchmarks/dynamo/pr_time_benchmarks/benchmarks/dtensor.py new file mode 100644 index 0000000000000..db59dfacb3f82 --- /dev/null +++ b/benchmarks/dynamo/pr_time_benchmarks/benchmarks/dtensor.py @@ -0,0 +1,62 @@ +import sys + +from benchmark_base import BenchmarkBase + +import torch +from torch.distributed._tensor import DTensor, Replicate +from torch.testing._internal.distributed.fake_pg import FakeStore + + +class BenchmarkDTensorDispatch(BenchmarkBase): + def __init__(self, operator, world_size) -> None: + super().__init__( + category=f"dtensor_dispatch_{operator}", + device="cuda", + ) + self.world_size = world_size + + def name(self) -> str: + prefix = f"{self.category()}" + return prefix + + def description(self) -> str: + return f"DTensor dispatch time for {self.category()}" + + def _prepare_once(self) -> None: + self.mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", (self.world_size,), mesh_dim_names=("dp",) + ) + self.a = DTensor.from_local( + torch.ones(10, 10, device=self.device()), self.mesh, [Replicate()] + ) + self.b = DTensor.from_local( + torch.ones(10, 10, device=self.device()), self.mesh, [Replicate()] + ) + + def _prepare(self) -> None: + pass + + +class BenchmarkDetach(BenchmarkDTensorDispatch): + def __init__(self, world_size) -> None: + super().__init__(operator="detach", world_size=world_size) + + def _work(self) -> None: + self.a.detach() + + +def main(): + world_size = 256 + fake_store = FakeStore() + torch.distributed.init_process_group( + "fake", store=fake_store, rank=0, world_size=world_size + ) + result_path = sys.argv[1] + BenchmarkDetach(world_size).enable_instruction_count().collect_all().append_results( + result_path + ) + torch.distributed.destroy_process_group() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/dynamo/torchbench.yaml b/benchmarks/dynamo/torchbench.yaml index b31a85ae26763..974c3d700a045 100644 --- a/benchmarks/dynamo/torchbench.yaml +++ b/benchmarks/dynamo/torchbench.yaml @@ -189,6 +189,10 @@ skip: - hf_Whisper - hf_distil_whisper - timm_vision_transformer_large + # https://github.com/pytorch/pytorch/issues/167895 + - stable_diffusion + - stable_diffusion_text_encoder + - stable_diffusion_unet device: cpu: diff --git a/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv index dc8b240ce570f..f3d8c7e65af04 100644 --- a/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv +++ b/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv @@ -484,24 +484,106 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,False,50.954394,0.000000 PyTorch,sum,sum_R256_V512_dim0_contiguousFalse_cpu,short,False,57.957757,0.000000 PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,False,53.592068,0.000000 PyTorch,sum,sum_R256_V512_dim1_contiguousFalse_cpu,short,False,51.339726,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,False,7.040985,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,False,7.168604,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,False,7.434442,0.000000 
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,False,7.078318,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,False,7.426670,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,False,7.679027,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,False,7.281365,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,False,7.682783,0.000000 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,False,8.381938,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,False,7.039854,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,False,7.399855,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,False,7.715193,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,False,7.255140,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,False,7.753522,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,False,8.364281,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,False,7.476377,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,False,8.458564,0.000000 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,False,9.391939,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.927,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.261,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.351,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.177,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,6.333,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,6.588,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,8.117,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,9.358,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,7.844,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,8.097,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.159,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.926,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.192,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.276,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,6.461,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,6.524,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,8.136,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.854,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,6.446,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,6.829,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.088,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.059,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.922,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.263,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,6.330,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,6.688,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,8.176,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.959,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,6.430,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,6.818,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.350,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.193,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.922,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.263,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,6.525,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,7.960,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.801,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,6.594,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,7.089,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.498,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.358,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.390,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.415,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.925,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,6.657,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,7.954,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.930,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,6.737,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,6.948,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.757,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.402,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.550,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.518,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,6.766,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.929,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,8.557,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,9.045,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,7.672,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,7.276,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,6.414,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,7.736,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,7.889,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,8.170,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,7.783,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,7.743,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.927,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,7.018,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,8.428,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,6.767,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.479,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,7.827,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.450,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.320,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,6.385,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,8.119,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,8.063,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.925,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,8.629,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,6.638,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.425,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.803,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.502,0.000000 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.429,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,6.549,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,7.749,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,7.301,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.682,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.930,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,6.738,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,6.798,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,6.506,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,6.494,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,6.668,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,6.696,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,7.115,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.910,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.410,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,6.868,0.000000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.924,0.000000 PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.float32,short,False,4.461410,0.000000 PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.bfloat16,short,False,4.560082,0.000000 PyTorch,addcmul,addcmul_M32_N64_cpu_dtypetorch.float32,short,False,5.141248,0.000000 diff --git a/benchmarks/operator_benchmark/pt/addmm_test.py b/benchmarks/operator_benchmark/pt/addmm_test.py index a98628944b3e8..3e94a9cd7f3dc 100644 --- a/benchmarks/operator_benchmark/pt/addmm_test.py +++ b/benchmarks/operator_benchmark/pt/addmm_test.py @@ -53,10 +53,8 @@ def forward(self, input_one, mat1, mat2): return torch.addmm(input_one, mat1, mat2) -op_bench.generate_pt_test(addmm_long_configs + addmm_long_configs, AddmmBenchmark) -op_bench.generate_pt_gradient_test( - addmm_long_configs + addmm_long_configs, AddmmBenchmark -) +op_bench.generate_pt_test(addmm_short_configs + addmm_long_configs, AddmmBenchmark) +op_bench.generate_pt_gradient_test(addmm_long_configs, AddmmBenchmark) """Mircobenchmark for addbmm operator.""" @@ -107,9 +105,7 @@ def forward(self, input_one, batch1, batch2): ) 
op_bench.generate_pt_test(addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark) -op_bench.generate_pt_gradient_test( - addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark -) +op_bench.generate_pt_gradient_test(addbmm_long_configs, AddbmmBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/pt/tensor_to_test.py b/benchmarks/operator_benchmark/pt/tensor_to_test.py index 621e58212cba2..9354c8c52eaa8 100644 --- a/benchmarks/operator_benchmark/pt/tensor_to_test.py +++ b/benchmarks/operator_benchmark/pt/tensor_to_test.py @@ -4,74 +4,84 @@ tensor_conversion_short_configs = op_bench.cross_product_configs( - M=( - 8, - 16, - 32, - ), - N=( - 16, - 64, - 128, - ), + M=[32], + N=[128], device=["cpu", "cuda"], + dtype_one=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], + dtype_two=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], tags=["short"], ) tensor_conversion_long_configs = op_bench.cross_product_configs( - M=( - 64, - 128, - 256, - 512, - ), - N=( - 256, - 512, - 1024, - 2048, - ), + M=[1024], + N=[1024], device=["cpu", "cuda"], + dtype_one=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], + dtype_two=[ + torch.bool, + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.half, + torch.bfloat16, + torch.float, + torch.double, + ], tags=["long"], ) -class FloatToHalfTensorConversionBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, device): +class TensorConversionBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, dtype_one, dtype_two, device): self.inputs = { "input": torch.rand( M, N, device=device, requires_grad=False, dtype=torch.float - ) + ).to(dtype=dtype_one) } + self.dtype_one = dtype_one + self.dtype_two = dtype_two def forward(self, input): - return input.to(torch.half) + return input.to(dtype=self.dtype_two) -class HalfToFloatTensorConversionBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, device): - self.inputs = { - "input": torch.rand( - M, N, device=device, requires_grad=False, dtype=torch.half - ) - } - - def forward(self, input): - return input.to(torch.float) - - -op_bench.generate_pt_test( - tensor_conversion_short_configs, FloatToHalfTensorConversionBenchmark -) -op_bench.generate_pt_test( - tensor_conversion_long_configs, FloatToHalfTensorConversionBenchmark -) -op_bench.generate_pt_test( - tensor_conversion_short_configs, HalfToFloatTensorConversionBenchmark -) -op_bench.generate_pt_test( - tensor_conversion_long_configs, HalfToFloatTensorConversionBenchmark -) +op_bench.generate_pt_test(tensor_conversion_short_configs, TensorConversionBenchmark) +op_bench.generate_pt_test(tensor_conversion_long_configs, TensorConversionBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv index d7a8e65aa85af..71a5930a01a3f 100644 --- a/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv +++ b/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv @@ 
-349,24 +349,106 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,FALSE,12.5841 PyTorch,sum,sum_R256_V512_dim0_contiguousFALSE_cpu,short,FALSE,20.8765 PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,FALSE,15.4414 PyTorch,sum,sum_R256_V512_dim1_contiguousFALSE_cpu,short,FALSE,15.3287 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0499 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3229 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4418 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.0868 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4495 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5578 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.2631 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5646 -PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,FALSE,5.7898 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0228 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3692 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4006 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.1107 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4119 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5583 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.3818 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5742 -PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,FALSE,6.8414 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.797 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.071 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.031 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.243 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,7.231 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,7.791 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,12.661 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,11.225 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,9.772 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,9.872 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.033 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.781 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.060 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.180 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.258 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,7.758 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,10.504 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.749 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,7.679 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,7.797 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.019 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.079 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.785 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.188 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,7.288 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,7.770 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,10.466 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.676 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,7.736 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,7.780 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.130 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.101 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.791 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.254 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,7.733 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,10.562 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.704 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,7.819 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,8.276 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.361 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.364 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.309 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.362 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.791 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,7.746 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,9.462 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.678 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,7.827 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,8.200 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.925 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.947 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.962 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.906 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,7.664 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.782 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,10.528 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,10.123 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,9.234 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,8.694 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,12.653 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,9.348 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,8.774 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,9.063 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,10.012 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,13.641 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.788 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,13.757 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,7.170 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,12.511 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.516 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,8.539 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.483 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.468 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,7.752 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,9.868 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,10.556 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.792 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,7.577 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,8.267 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.819 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.715 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.754 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.825 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,7.790 
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,9.219 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,5.977 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.069 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.794 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,8.301 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,7.401 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,7.843 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,7.117 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,7.170 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,8.000 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,9.284 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.179 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.645 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,7.988 +PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.792 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.quint8",short,FALSE,9.4657 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint8",short,FALSE,9.4625 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint32",short,FALSE,9.4165 diff --git a/benchmarks/sparse/spmm.py b/benchmarks/sparse/spmm.py index b707556dd7a15..b2c658d6faeb6 100644 --- a/benchmarks/sparse/spmm.py +++ b/benchmarks/sparse/spmm.py @@ -52,19 +52,18 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count): start.record() coo.matmul(mat) stop.record() - times.append(start.elapsed_time(stop)) - coo_mean_time = sum(times) / len(times) + coo_mean_time = sum(times) / len(times) - times = [] - for _ in range(test_count): - start.record() - csr.matmul(mat) - stop.record() - times.append(start.elapsed_time(stop)) + times = [] + for _ in range(test_count): + start.record() + csr.matmul(mat) + stop.record() + times.append(start.elapsed_time(stop)) - csr_mean_time = sum(times) / len(times) + csr_mean_time = sum(times) / len(times) return coo_mean_time, csr_mean_time @@ -84,10 +83,13 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count): if args.outfile == "stdout": outfile = sys.stdout + need_close = False elif args.outfile == "stderr": outfile = sys.stderr + need_close = False else: outfile = open(args.outfile, "a") + need_close = True test_count = args.test_count m = args.m @@ -148,3 +150,5 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count): time, 
file=outfile, ) + if need_close: + outfile.close() diff --git a/benchmarks/sparse/spmv.py b/benchmarks/sparse/spmv.py index f8900882ca4ec..3e9502686a884 100644 --- a/benchmarks/sparse/spmv.py +++ b/benchmarks/sparse/spmv.py @@ -82,10 +82,13 @@ def test_sparse_coo_and_csr(m, nnz, test_count): if args.outfile == "stdout": outfile = sys.stdout + need_close = False elif args.outfile == "stderr": outfile = sys.stderr + need_close = False else: outfile = open(args.outfile, "a") + need_close = True test_count = args.test_count m = args.m @@ -132,3 +135,5 @@ def test_sparse_coo_and_csr(m, nnz, test_count): time_csr, file=outfile, ) + if need_close: + outfile.close() diff --git a/benchmarks/sparse/triton_ops.py b/benchmarks/sparse/triton_ops.py index 48a88d592ea2c..a49a53bcd207c 100644 --- a/benchmarks/sparse/triton_ops.py +++ b/benchmarks/sparse/triton_ops.py @@ -179,10 +179,13 @@ def integer_or_float_list(a): if args.outfile == "stdout": outfile = sys.stdout + need_close = False elif args.outfile == "stderr": outfile = sys.stderr + need_close = False else: outfile = open(args.outfile, "a") + need_close = True ops = args.ops.split(",") @@ -434,3 +437,5 @@ def show_best_messages(best_messages=best_messages): if op not in {"bsr_scatter_mm6", "bsr_dense_mm_with_meta"}: # Break on operations that do not consume parameters break + if need_close: + outfile.close() diff --git a/benchmarks/transformer/score_mod.py b/benchmarks/transformer/score_mod.py index 928cbf27df5b1..e9af132df28a9 100644 --- a/benchmarks/transformer/score_mod.py +++ b/benchmarks/transformer/score_mod.py @@ -125,6 +125,17 @@ def wrapper(config, *args, **kwargs): ] DtypeString = Literal["bfloat16", "float16", "float32"] SpeedupType = Literal["fwd", "bwd"] +# Operator Name mapping +backend_to_operator_name = { + "math": "math attention kernel", + "efficient": "efficient attention kernel", + "cudnn": "cudnn attention kernel", + "fav2": "flash attention 2 kernel", + "fav3": "flash attention 3 kernel", + "fakv": "flash attention kv cache kernel", + "og-eager": "eager attention kernel", + "flex": "flex attention kernel", +} def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float: @@ -1265,12 +1276,14 @@ class BenchmarkRecord: model: ModelInfo metric: MetricInfo + operator_name = backend_to_operator_name.get(backend, backend) + # Benchmark extra info benchmark_extra_info = { "input_config": input_config, "device": device, "arch": device_arch, - "operator_name": backend, + "operator_name": operator_name, "attn_type": config.attn_type, "shape": str(config.shape), "max_autotune": config.max_autotune, @@ -1288,7 +1301,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, "attn_type": config.attn_type, }, ), @@ -1315,7 +1328,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, }, ), metric=MetricInfo( @@ -1341,7 +1354,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, }, ), metric=MetricInfo( @@ -1371,7 +1384,7 @@ class BenchmarkRecord: type="attention-benchmark", origins=["pytorch"], extra_info={ - "operator_name": backend, + "operator_name": operator_name, }, ), metric=MetricInfo( diff --git a/buckbuild.bzl b/buckbuild.bzl index 4c1affd10e1bc..1d26485baca89 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -2,13 +2,14 @@ 
# These load paths point to different files in internal and OSS environment
 load("@bazel_skylib//lib:paths.bzl", "paths")
+load("//tools/build_defs:cell_defs.bzl", "get_fbsource_cell")
 load("//tools/build_defs:fb_native_wrapper.bzl", "fb_native")
 load("//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library")
 load("//tools/build_defs:fb_xplat_genrule.bzl", "fb_xplat_genrule")
 load("//tools/build_defs/windows:windows_flag_map.bzl", "windows_convert_gcc_clang_flags")
 load("//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode")
 load("//tools/build_defs:glob_defs.bzl", "subdir_glob")
-load("//tools/build_defs:platform_defs.bzl", "APPLETVOS", "IOS", "MACOSX")
+load("//tools/build_defs:platform_defs.bzl", "IOS", "MACOSX")
 load("//tools/build_defs:type_defs.bzl", "is_list", "is_string")
 load("//tools/build_defs/android:build_mode_defs.bzl", is_production_build_android = "is_production_build")
 load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build", is_profile_build_ios = "is_profile_build")
@@ -590,6 +591,9 @@ def pt_operator_query_codegen(
         pt_allow_forced_schema_registration = True,
         compatible_with = [],
         apple_sdks = None):
+    if get_fbsource_cell() == "fbcode":
+        return
+
     oplist_dir_name = name + "_pt_oplist"

     # @lint-ignore BUCKLINT
@@ -865,6 +869,9 @@ def define_buck_targets(
         pt_xplat_cxx_library = fb_xplat_cxx_library,
         c2_fbandroid_xplat_compiler_flags = [],
         labels = []):
+    if get_fbsource_cell() == "fbcode":
+        return
+
     # @lint-ignore BUCKLINT
     fb_native.filegroup(
         name = "metal_build_srcs",
@@ -1090,7 +1097,7 @@ def define_buck_targets(
         srcs = [
             "caffe2/core/common.cc",
         ],
-        apple_sdks = (IOS, MACOSX, APPLETVOS),
+        apple_sdks = (IOS, MACOSX),
         compiler_flags = get_pt_compiler_flags(),
         labels = labels,
         # @lint-ignore BUCKLINT link_whole
diff --git a/build_variables.bzl b/build_variables.bzl
index 70121e19d8099..258e739300c1e 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -1025,6 +1025,7 @@ libtorch_python_core_sources = [
 libtorch_python_distributed_core_sources = [
     "torch/csrc/distributed/c10d/init.cpp",
     "torch/csrc/distributed/c10d/python_comm_hook.cpp",
+    "torch/csrc/distributed/c10d/python_callback_work.cpp",
 ]

 libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [
diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h
index 747b73da01352..7d2c814fe84f7 100644
--- a/c10/core/Allocator.h
+++ b/c10/core/Allocator.h
@@ -19,6 +19,17 @@
 namespace c10 {

+using CaptureId_t = unsigned long long;
+// first is set if the instance is created by CUDAGraph::capture_begin.
+// second is set if the instance is created by at::cuda::graph_pool_handle.
+using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
+
+struct MempoolIdHash {
+  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
+    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
+  }
+};
+
 // A DataPtr is a unique pointer (with an attached deleter and some
 // context for the deleter) to some memory, which also records what
 // device is for its data.
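Aside: the MempoolIdHash functor moved into c10/core/Allocator.h above is the hash used for unordered containers keyed by MempoolId_t (the CUDA caching allocator previously defined it locally for its per-pool maps, see the removal further down). A minimal standalone sketch of that usage, assuming an illustrative map name and mapped value that are not taken from this patch:

#include <cstddef>
#include <unordered_map>
#include <utility>

// Stand-ins mirroring the typedefs added in c10/core/Allocator.h.
using CaptureId_t = unsigned long long;
using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;

struct MempoolIdHash {
  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
    // Whichever of the two ids is set serves as the hash value.
    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
  }
};

int main() {
  // Hypothetical per-pool bookkeeping keyed by MempoolId_t.
  std::unordered_map<MempoolId_t, int, MempoolIdHash> pool_use_count;
  pool_use_count[{1, 0}] += 1;  // pool created by CUDAGraph::capture_begin
  pool_use_count[{0, 1}] += 1;  // pool created by graph_pool_handle
  return pool_use_count.size() == 2 ? 0 : 1;
}

std::pair has no std::hash specialization, which is why a dedicated functor is needed whenever MempoolId_t is used as an unordered-container key.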
diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h index ad168b8c05987..d2b9cc080413d 100644 --- a/c10/core/AutogradState.h +++ b/c10/core/AutogradState.h @@ -1,6 +1,8 @@ #pragma once +#include #include +#include namespace c10 { @@ -15,7 +17,8 @@ struct C10_API AutogradState { bool inference_mode, bool fw_grad_mode, bool multithreading_enabled) - : grad_mode_(grad_mode), + : graph_exec_group_(std::nullopt), + grad_mode_(grad_mode), inference_mode_(inference_mode), fw_grad_mode_(fw_grad_mode), multithreading_enabled_(multithreading_enabled), @@ -41,6 +44,10 @@ struct C10_API AutogradState { view_replay_enabled_ = view_replay_enabled; } + void set_graph_exec_group(std::optional group) { + graph_exec_group_ = std::move(group); + } + bool get_grad_mode() const { return grad_mode_; } @@ -61,7 +68,12 @@ struct C10_API AutogradState { return view_replay_enabled_; } + const std::optional& get_graph_exec_group() const { + return graph_exec_group_; + } + private: + std::optional graph_exec_group_; bool grad_mode_ : 1; bool inference_mode_ : 1; bool fw_grad_mode_ : 1; diff --git a/c10/core/CachingDeviceAllocator.h b/c10/core/CachingDeviceAllocator.h index 0bec03ae417fa..c95d0714ce3bd 100644 --- a/c10/core/CachingDeviceAllocator.h +++ b/c10/core/CachingDeviceAllocator.h @@ -96,6 +96,13 @@ struct C10_API DeviceAllocator : public c10::Allocator { // Resets peak memory usage statistics for the specified device virtual void resetPeakStats(c10::DeviceIndex device) = 0; + + // Return the free memory size and total memory size in bytes for the + // specified device. + virtual std::pair getMemoryInfo(c10::DeviceIndex device) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, "getMemoryInfo is not implemented for this allocator yet."); + } }; // This function is used to get the DeviceAllocator for a specific device type diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index 72e72f49a5e40..d1ec51b6a47d6 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -59,6 +59,9 @@ constexpr DispatchKeySet nested_dispatch_keyset = {DispatchKey::AutogradNestedTensor, DispatchKey::NestedTensor}) | DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); +constexpr DispatchKeySet functorch_batched_dispatch_keyset = + DispatchKeySet(DispatchKey::FuncTorchBatched); + DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { @@ -77,6 +80,8 @@ DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { return backend_dispatch_keyset; case DispatchKey::CompositeExplicitAutogradNonFunctional: return non_functional_backend_dispatch_keyset; + case DispatchKey::FuncTorchBatchedDecomposition: + return functorch_batched_dispatch_keyset; default: return DispatchKeySet(t); } @@ -171,7 +176,7 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) { os << k; first = false; } - os << ")"; + os << ')'; return os; } diff --git a/c10/core/SafePyObject.h b/c10/core/SafePyObject.h index 1ec0cdb6751e9..bcace0ac358b4 100644 --- a/c10/core/SafePyObject.h +++ b/c10/core/SafePyObject.h @@ -44,7 +44,7 @@ struct C10_API SafePyObject { (*other.pyinterpreter_)->incref(other.data_); } if (data_ != nullptr) { - (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false); + (*pyinterpreter_)->decref(data_); } data_ = other.data_; pyinterpreter_ = other.pyinterpreter_; @@ -53,7 +53,7 @@ struct C10_API SafePyObject { ~SafePyObject() { if (data_ != nullptr) { - (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false); + 
(*pyinterpreter_)->decref(data_); } } diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index ba1068e72695c..040c6abb7d8e2 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -27,26 +27,13 @@ #include C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") namespace c10 { // See [dtype Macros note] in torch/headeronly/core/ScalarType.h // regarding macros. -template -struct CppTypeToScalarType; - -#define SPECIALIZE_CppTypeToScalarType(cpp_type, scalar_type) \ - template <> \ - struct CppTypeToScalarType \ - : std:: \ - integral_constant { \ - }; - -AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType) - -#undef SPECIALIZE_CppTypeToScalarType - #define DEFINE_CONSTANT(_, name) \ constexpr ScalarType k##name = ScalarType::name; @@ -105,13 +92,6 @@ inline bool isComplexType(ScalarType t) { t == ScalarType::ComplexDouble); } -inline bool isQIntType(ScalarType t) { - // Don't forget to extend this when adding new QInt types - return t == ScalarType::QInt8 || t == ScalarType::QUInt8 || - t == ScalarType::QInt32 || t == ScalarType::QUInt4x2 || - t == ScalarType::QUInt2x4; -} - inline bool isBitsType(ScalarType t) { return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 || t == ScalarType::Bits4x2 || t == ScalarType::Bits8 || @@ -205,6 +185,12 @@ inline bool isSignedType(ScalarType t) { break; // Do not add default here, but rather define behavior of every new entry // here. `-Wswitch-enum` would raise a warning in those cases. + // TODO: get PyTorch to adopt exhaustive switches by default with a way to + // opt specific switches to being non-exhaustive. + // Exhaustive: + // `-Wswitch-enum`, `-Wswitch-default`, `-Wno-covered-switch-default` + // Non-Exhaustive: + // `-Wno-switch-enum`, `-Wswitch-default`, `-Wcovered-switch-default` } TORCH_CHECK(false, "Unknown ScalarType ", t); #undef CASE_ISSIGNED diff --git a/c10/core/StorageImpl.cpp b/c10/core/StorageImpl.cpp index a614fc9234c94..00fc03bbd0fcf 100644 --- a/c10/core/StorageImpl.cpp +++ b/c10/core/StorageImpl.cpp @@ -48,6 +48,30 @@ void warnDeprecatedDataPtr() { TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid."); } +void StorageImpl::incref_pyobject() const { + // Because intrusive_ptr incref uses relaxed memory order, we need to + // do an acquire fence to ensure that the kHasPyObject bit was + // observed before the load of the PyObject* below. + // NB: This is a no-op on x86/x86-64 + std::atomic_thread_fence(std::memory_order_acquire); + + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->incref(obj); +} + +void StorageImpl::decref_pyobject() const { + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->decref(obj); +} + +bool StorageImpl::try_incref_pyobject() const { + c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter(); + if (C10_UNLIKELY(!interp)) { + return false; + } + return (*interp)->try_incref(pyobj_slot_); +} + void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) { // Allowlist verification. 
// Only if the devicetype is in the allowlist, diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index f34a1baed7a48..c7dbd5c1f005b 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -105,6 +105,12 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target { data_ptr_.clear(); } + void incref_pyobject() const override final; + + void decref_pyobject() const override final; + + bool try_incref_pyobject() const override final; + size_t nbytes() const { // OK to do this instead of maybe_as_int as nbytes is guaranteed positive TORCH_CHECK(!size_bytes_is_heap_allocated_); @@ -370,4 +376,18 @@ C10_API c10::intrusive_ptr make_storage_impl( bool resizable, std::optional device_opt); +namespace detail { + +#ifndef C10_MOBILE +template +struct TargetTraits< + T, + std::enable_if_t< + std::is_base_of_v>>> { + static constexpr bool can_have_pyobject = true; +}; +#endif + +} // namespace detail + } // namespace c10 diff --git a/c10/core/SymBool.cpp b/c10/core/SymBool.cpp index d804eb9d27409..48c407b8b069c 100644 --- a/c10/core/SymBool.cpp +++ b/c10/core/SymBool.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace c10 { @@ -111,4 +112,17 @@ bool SymBool::has_hint() const { return toSymNodeImpl()->has_hint(); } +SymInt SymBool::toSymInt() const { + // If concrete bool, return concrete SymInt + if (auto ma = maybe_as_bool()) { + return SymInt(*ma ? 1 : 0); + } + + // Symbolic case: use sym_ite to convert bool to int (0 or 1) + auto node = toSymNodeImpl(); + auto one_node = node->wrap_int(1); + auto zero_node = node->wrap_int(0); + return SymInt(node->sym_ite(one_node, zero_node)); +} + } // namespace c10 diff --git a/c10/core/SymBool.h b/c10/core/SymBool.h index d5d509e239b1d..a27a28a5bf8a3 100644 --- a/c10/core/SymBool.h +++ b/c10/core/SymBool.h @@ -12,6 +12,8 @@ namespace c10 { +class SymInt; + class C10_API SymBool { public: /*implicit*/ SymBool(bool b) : data_(b) {} @@ -80,6 +82,10 @@ class C10_API SymBool { return toSymNodeImplUnowned()->constant_bool(); } + // Convert SymBool to SymInt (0 or 1) + // This is the C++ equivalent of Python's cast_symbool_to_symint_guardless + SymInt toSymInt() const; + bool is_heap_allocated() const { return ptr_; } diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index c59524a0932c2..94a7375cc32fb 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -277,7 +277,6 @@ void TensorImpl::release_resources() { if (storage_) { storage_ = {}; } - pyobj_slot_.maybe_destroy_pyobj(); } #ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY @@ -989,6 +988,30 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) { } } +void TensorImpl::incref_pyobject() const { + // Because intrusive_ptr incref uses relaxed memory order, we need to + // do an acquire fence to ensure that the kHasPyObject bit was + // observed before the load of the PyObject* below. 
+ // NB: This is a no-op on x86/x86-64 + std::atomic_thread_fence(std::memory_order_acquire); + + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->incref(obj); +} + +void TensorImpl::decref_pyobject() const { + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->decref(obj); +} + +bool TensorImpl::try_incref_pyobject() const { + c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter(); + if (C10_UNLIKELY(!interp)) { + return false; + } + return (*interp)->try_incref(pyobj_slot_); +} + namespace impl { namespace { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 66893b86c8469..71a0195dde773 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -57,6 +57,8 @@ C10_DECLARE_bool(caffe2_keep_on_shrink); // respect caffe2_keep_on_shrink. C10_DECLARE_int64(caffe2_max_keep_on_shrink_memory); +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + namespace at { class Tensor; class TensorBase; @@ -2176,6 +2178,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return &pyobj_slot_; } + void incref_pyobject() const override final; + + void decref_pyobject() const override final; + + bool try_incref_pyobject() const override final; + private: // See NOTE [std::optional operator usage in CUDA] // We probably don't want to expose this publicly until @@ -3077,6 +3085,19 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { friend class C10_TensorImpl_Size_Check_Dummy_Class; }; +namespace detail { + +#ifndef C10_MOBILE +template +struct TargetTraits< + T, + std::enable_if_t>>> { + static constexpr bool can_have_pyobject = true; +}; +#endif + +} // namespace detail + // Note [TensorImpl size constraints] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Changed the size of TensorImpl? 
If the size went down, good for @@ -3303,3 +3324,5 @@ static_assert( #undef C10_GCC_VERSION_MINOR } // namespace c10 + +C10_DIAGNOSTIC_POP() diff --git a/c10/core/TensorOptions.cpp b/c10/core/TensorOptions.cpp index d3282ae7114e5..b1a90cce30edc 100644 --- a/c10/core/TensorOptions.cpp +++ b/c10/core/TensorOptions.cpp @@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorOptions& options) { } else { stream << "(nullopt)"; } - stream << ")"; + stream << ')'; return stream; } diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp index 8676f0aaf8e0e..52d263fad36c5 100644 --- a/c10/core/impl/PyInterpreter.cpp +++ b/c10/core/impl/PyInterpreter.cpp @@ -11,8 +11,11 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { void incref(PyObject* pyobj) const override {} // do nothing - void decref(PyObject* pyobj, bool has_pyobj_slot) const override { - } // do nothing + void decref(PyObject* pyobj) const override {} // do nothing + + bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override { + return false; + } #define PANIC(m) \ TORCH_INTERNAL_ASSERT( \ @@ -20,6 +23,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { "attempted to call " #m \ " on a Tensor with nontrivial PyObject after corresponding interpreter died") + size_t refcnt(PyObject* pyobj) const override { + PANIC(refcnt); + } + c10::intrusive_ptr detach(const TensorImpl* self) const override { PANIC(detach); } diff --git a/c10/core/impl/PyInterpreter.h b/c10/core/impl/PyInterpreter.h index def708c24b802..463b1e520b36e 100644 --- a/c10/core/impl/PyInterpreter.h +++ b/c10/core/impl/PyInterpreter.h @@ -18,6 +18,9 @@ namespace c10 { struct IValue; class OperatorHandle; struct TensorImpl; +namespace impl { +struct PyObjectSlot; +} // namespace impl } // namespace c10 namespace torch::jit { @@ -126,9 +129,12 @@ struct C10_API PyInterpreterVTable { // Run Py_INCREF on a PyObject. virtual void incref(PyObject* pyobj) const = 0; - // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call - // See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg] - virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0; + // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call. + virtual void decref(PyObject* pyobj) const = 0; + // Run PyUnstable_TryIncRef on a PyObject if it's not NULL. + virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0; + // Run Py_REFCNT on a PyObject. 
+ virtual size_t refcnt(PyObject* pyobj) const = 0; // Perform a detach by deferring to the __torch_dispatch__ implementation of // detach, which will also arrange for the PyObject to get copied in this diff --git a/c10/core/impl/PyObjectSlot.cpp b/c10/core/impl/PyObjectSlot.cpp deleted file mode 100644 index 0f1bfb2110747..0000000000000 --- a/c10/core/impl/PyObjectSlot.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include - -namespace c10::impl { - -PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {} - -PyObjectSlot::~PyObjectSlot() { - maybe_destroy_pyobj(); -} - -void PyObjectSlot::maybe_destroy_pyobj() { - if (owns_pyobj()) { - TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr); - TORCH_INTERNAL_ASSERT(pyobj_ != nullptr); - (*pyobj_interpreter_.load(std::memory_order_acquire)) - ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true); - // NB: this destructor can only be entered when there are no - // references to this C++ object (obviously), NOR any references - // to the PyObject (if there are references to the PyObject, - // then the PyObject holds an owning reference to the tensor). - // So it is OK to clear pyobj_ here as it is impossible for it to - // be used again (modulo weak reference races) - pyobj_ = nullptr; // for safety - } -} - -PyInterpreter* PyObjectSlot::pyobj_interpreter() { - return pyobj_interpreter_.load(std::memory_order_acquire); -} - -PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - return reinterpret_cast( - reinterpret_cast(pyobj_) & ~0x1ULL); -} - -PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const { - auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); - if (interpreter) { - return *interpreter; - } - TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set"); -} - -bool PyObjectSlot::owns_pyobj() { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - return reinterpret_cast(pyobj_) & 1; -} - -void PyObjectSlot::set_owns_pyobj(bool b) { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - pyobj_ = reinterpret_cast( - reinterpret_cast(_unchecked_untagged_pyobj()) | b); -} - -} // namespace c10::impl diff --git a/c10/core/impl/PyObjectSlot.h b/c10/core/impl/PyObjectSlot.h index 58b2490eba001..a0633401b3634 100644 --- a/c10/core/impl/PyObjectSlot.h +++ b/c10/core/impl/PyObjectSlot.h @@ -8,117 +8,58 @@ #include +namespace torch::utils { +class PyObjectPreservation; +} + namespace c10::impl { struct C10_API PyObjectSlot { public: - PyObjectSlot(); - - ~PyObjectSlot(); - - void maybe_destroy_pyobj(); - - // Associate the TensorImpl with the specified PyObject, and, if necessary, - // also tag the interpreter. - // - // NB: This lives in a header so that we can inline away the switch on status - // - // NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after - // PyObject if necessary! - void init_pyobj(PyObject* pyobj) { - pyobj_interpreter_.store( - getGlobalPyInterpreter(), std::memory_order_relaxed); - pyobj_ = pyobj; - } + PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {} // Query the PyObject interpreter. This may return null if there is no - // interpreter. This is racy! - PyInterpreter* pyobj_interpreter(); - - PyObject* _unchecked_untagged_pyobj() const; - - // Test the interpreter tag. If tagged for the current interpreter, return - // a non-nullopt (but possibly null) PyObject. If (possibly) untagged, - // returns a nullopt. If it is definitely invalid, raises an error. 
- // - // If `ignore_hermetic_tls` is false and this function is called from a - // hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then - // nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic - // context is ignored, allowing you to check the interpreter tag of a - // nonhermetic PyObject from within a hermetic context. This is necessary - // because there are some cases where the deallocator function of a - // nonhermetic PyObject is called from within a hermetic context, so it must - // be properly treated as a nonhermetic PyObject. - // - // NB: this lives in header so that we can avoid actually creating the - // std::optional + // interpreter. + PyInterpreter* pyobj_interpreter() const { + return pyobj_interpreter_.load(std::memory_order_acquire); + } - // @todo alban: I'm not too sure what's going on here, we can probably delete - // it but it's worthwhile making sure - std::optional check_pyobj(bool ignore_hermetic_tls = false) const { - impl::PyInterpreter* interpreter = - pyobj_interpreter_.load(std::memory_order_acquire); - if (interpreter == nullptr) { - return std::nullopt; - } + PyInterpreter& load_pyobj_interpreter() const { + auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); + TORCH_INTERNAL_ASSERT( + interpreter, "cannot access PyObject for Tensor - no interpreter set"); + return *interpreter; + } - if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) { - return std::nullopt; - } else { - return _unchecked_untagged_pyobj(); - } + PyObject* load_pyobj() const { + return pyobj_.load(std::memory_order_acquire); } - PyInterpreter& load_pyobj_interpreter() const; + void store_pyobj(PyObject* obj) { + pyobj_.store(obj, std::memory_order_release); + } - bool owns_pyobj(); + bool has_unique_reference() const { + PyObject* pyobj = load_pyobj(); + return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1; + } - void set_owns_pyobj(bool b); + void clear() { + pyobj_.store(nullptr, std::memory_order_relaxed); + pyobj_interpreter_.store(nullptr, std::memory_order_relaxed); + } private: - // This field contains the interpreter tag for this object. See - // Note [Python interpreter tag] for general context - // - // Note [Memory ordering on Python interpreter tag] - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // What memory_order do we need when accessing this atomic? We don't - // need a single total modification order (as provided by - // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only - // transition from -1 to some positive integer and never changes afterwards. - // Because there is only one modification, it trivially already has a total - // modification order (e.g., we don't need fences or locked instructions on - // x86) - // - // In fact, one could make a reasonable argument that relaxed reads are OK, - // due to the presence of external locking (GIL) to ensure that interactions - // with other data structures are still correctly synchronized, so that - // we fall in the "Single-Location Data Structures" case as described in - // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf - // However, on x86, it doesn't matter if I use acquire or relaxed on the load - // as I get the same assembly in both cases. So I just use the more - // conservative acquire (which will impede compiler optimizations but I don't - // care) + // This is now always the global interpreter if the PyObject is set. + // Maybe we can remove this field some day... 
std::atomic pyobj_interpreter_; - // This field contains a reference to a PyObject representing this Tensor. - // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new - // PyObject for it and set this field. This field does not have to be - // protected by an atomic as it is only allowed to be accessed when you hold - // the GIL, or during destruction of the tensor. - // - // When a PyObject dies, you are obligated to clear this field - // (otherwise, you will try to use-after-free the pyobj); this currently - // occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp - // - // NB: Ordinarily, this should not be a strong reference, as if the - // PyObject owns the Tensor, this would create a reference cycle. - // However, sometimes this ownership flips. To track who owns - // who, this has a single pointer tag indicating whether or not the - // C++ object owns the PyObject (the common case, zero, means PyObject - // owns the C++ object); see _unchecked_untagged_pyobj for raw access - // or check_pyobj for checked access. See references to PyObject - // resurrection in torch/csrc/autograd/python_variable.cpp - PyObject* pyobj_; + // The PyObject representing this Tensor or nullptr. Ownership is managed + // by intrusive_ptr. By the time the PyObjectSlot is destroyed, this + // reference is already dead. + std::atomic pyobj_; + + friend class torch::utils::PyObjectPreservation; }; } // namespace c10::impl diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp index 3046259b48a3e..5414d838cd8c4 100644 --- a/c10/cuda/CUDAAllocatorConfig.cpp +++ b/c10/cuda/CUDAAllocatorConfig.cpp @@ -106,6 +106,9 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) { } else if (key == "graph_capture_record_stream_reuse") { i = parseGraphCaptureRecordStreamReuse(tokenizer, i); used_native_specific_option = true; + } else if (key == "per_process_memory_fraction") { + i = parsePerProcessMemoryFraction(tokenizer, i); + used_native_specific_option = true; } else { const auto& keys = c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys(); @@ -146,6 +149,18 @@ size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse( return i; } +double CUDAAllocatorConfig::parsePerProcessMemoryFraction( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i) { + tokenizer.checkToken(++i, ":"); + double val_env = tokenizer.toDouble(++i); + TORCH_CHECK_VALUE( + val_env >= 0.0 && val_env <= 1.0, + "per_process_memory_fraction is invalid, set it in [0.0, 1.0]"); + m_per_process_memory_fraction = val_env; + return i; +} + size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads( const c10::CachingAllocator::ConfigTokenizer& tokenizer, size_t i) { diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h index d61f69467a2dc..4e6097a406bc2 100644 --- a/c10/cuda/CUDAAllocatorConfig.h +++ b/c10/cuda/CUDAAllocatorConfig.h @@ -61,6 +61,10 @@ class C10_CUDA_API CUDAAllocatorConfig { return instance().m_graph_capture_record_stream_reuse; } + static double per_process_memory_fraction() { + return instance().m_per_process_memory_fraction; + } + /** Pinned memory allocator settings */ static bool pinned_use_cuda_host_register() { return instance().m_pinned_use_cuda_host_register; @@ -152,7 +156,8 @@ class C10_CUDA_API CUDAAllocatorConfig { "pinned_use_hip_host_register", "graph_capture_record_stream_reuse", "pinned_reserve_segment_size_mb", - "pinned_num_register_threads"}; + "pinned_num_register_threads", + 
"per_process_memory_fraction"}; return keys; } @@ -177,6 +182,9 @@ class C10_CUDA_API CUDAAllocatorConfig { size_t parseGraphCaptureRecordStreamReuse( const c10::CachingAllocator::ConfigTokenizer& tokenizer, size_t i); + double parsePerProcessMemoryFraction( + const c10::CachingAllocator::ConfigTokenizer& tokenizer, + size_t i); std::atomic m_pinned_num_register_threads{1}; std::atomic m_pinned_reserve_segment_size_mb{0}; @@ -189,6 +197,7 @@ class C10_CUDA_API CUDAAllocatorConfig { std::atomic m_release_lock_on_cudamalloc{false}; std::atomic m_pinned_use_cuda_host_register{false}; std::atomic m_graph_capture_record_stream_reuse{false}; + std::atomic m_per_process_memory_fraction{1.0}; }; // Keep this for backwards compatibility diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 091e580f95819..9e7823a394302 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1012,12 +1012,6 @@ PrivatePoolState::PrivatePoolState( } } -struct MempoolIdHash { - std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { - return mempool_id.first != 0 ? mempool_id.first : mempool_id.second; - } -}; - cudaError_t allocPrimitive(void** ptr, size_t size, AllocParams& p) { if (p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator()) { *ptr = p.pool->owner_PrivatePool->allocator()->raw_alloc(size); @@ -1100,7 +1094,7 @@ class RingBuffer { } // anonymous namespace } // namespace Native -static std::string reportProcessMemoryInfo(c10::DeviceIndex device) { +static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) { #ifdef PYTORCH_C10_DRIVER_API_SUPPORTED void* nvml_handle = DriverAPI::get_nvml_handle(); if (!nvml_handle) { @@ -1111,9 +1105,6 @@ static std::string reportProcessMemoryInfo(c10::DeviceIndex device) { return true; }(); - cudaDeviceProp prop{}; - C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - // NOLINTNEXTLINE(*-c-arrays) char pci_id[80]; snprintf( @@ -1215,14 +1206,16 @@ class DeviceCachingAllocator { // record used memory. size_t total_allocated_memory = 0; - size_t allowed_memory_maximum = 0; + cudaDeviceProp device_prop; + + // maximum amount of memory that device is allowed to + // allocate. This is set iff memory fraction is less than 1 + std::optional allowed_memory_maximum{std::nullopt}; // all live expandable segments std::vector expandable_segments_; std::vector devices_with_peer_access_; - bool set_fraction = false; - bool record_history = false; std::atomic context_recorder_; @@ -1264,6 +1257,9 @@ class DeviceCachingAllocator { : device_id(id), large_blocks(/*small=*/false), small_blocks(/*small=*/true) { + C10_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, id)); + + setMemoryFraction(CUDAAllocatorConfig::per_process_memory_fraction()); stats.max_split_size = static_cast(AcceleratorAllocatorConfig::max_split_size()); context_recorder_.store(nullptr); @@ -1399,7 +1395,7 @@ class DeviceCachingAllocator { if (!block_found) { // Do garbage collection if the flag is set. 
if (C10_UNLIKELY( - set_fraction && + allowed_memory_maximum.has_value() && AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) { garbage_collect_cached_blocks(context); @@ -1456,11 +1452,12 @@ class DeviceCachingAllocator { C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); std::string allowed_info; - if (set_fraction) { - allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + if (allowed_memory_maximum.has_value()) { + allowed_info = + format_size(allowed_memory_maximum.value()) + " allowed; "; } - std::string proc_info = reportProcessMemoryInfo(device_id); + std::string proc_info = reportProcessMemoryInfo(device_prop); record_trace( TraceEntry::OOM, @@ -1518,7 +1515,7 @@ class DeviceCachingAllocator { for (const auto& obs : observers_local) { obs(device_id, alloc_size, - set_fraction ? allowed_memory_maximum : device_total, + allowed_memory_maximum.value_or(device_total), device_free); } @@ -2015,25 +2012,26 @@ class DeviceCachingAllocator { /** get memory fraction limiting maximum allocated memory **/ double getMemoryFraction() { - if (!set_fraction) { + if (!allowed_memory_maximum.has_value()) { return 1.0; } - size_t device_free = 0; - size_t device_total = 0; - C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); - return static_cast(allowed_memory_maximum) / - static_cast(device_total); + return static_cast(allowed_memory_maximum.value()) / + static_cast(device_prop.totalGlobalMem); } /** set memory fraction to limit maximum allocated memory **/ void setMemoryFraction(double fraction) { - size_t device_free = 0; - size_t device_total = 0; - C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); - allowed_memory_maximum = - static_cast(fraction * static_cast(device_total)); - set_fraction = true; + TORCH_CHECK( + 0 <= fraction && fraction <= 1, + "invalid fraction:", + fraction, + ". Please set within [0, 1]."); + allowed_memory_maximum = std::nullopt; + if (fraction < 1.0) { + allowed_memory_maximum = static_cast( + fraction * static_cast(device_prop.totalGlobalMem)); + } } /** get expandable segment size for all the streams on device **/ @@ -3010,7 +3008,7 @@ class DeviceCachingAllocator { BlockPool& pool = *p.pool; if (C10_UNLIKELY( - set_fraction && + allowed_memory_maximum.has_value() && AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) { // Track block reuse interval only when garbage collection is enabled. 
++pool.get_free_blocks_call_count; @@ -3083,7 +3081,7 @@ class DeviceCachingAllocator { size_t gc_threshold = static_cast( AcceleratorAllocatorConfig::garbage_collection_threshold() * - static_cast(allowed_memory_maximum)); + static_cast(allowed_memory_maximum.value())); // No need to trigger GC yet if (total_allocated_memory <= gc_threshold) { return; @@ -3161,8 +3159,8 @@ class DeviceCachingAllocator { bool active_pool = p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator(); - if (set_fraction && - total_allocated_memory + size > allowed_memory_maximum) { + if (allowed_memory_maximum.has_value() && + total_allocated_memory + size > allowed_memory_maximum.value()) { p.err = cudaErrorMemoryAllocation; return false; // Temporarily disable checkpointing & cudagraphs internally @@ -3859,7 +3857,6 @@ class NativeCachingAllocator : public CUDAAllocator { "Allocator not initialized for device ", device, ": did you call init?"); - C10_CUDA_CHECK(c10::cuda::SetDevice(device)); return device_allocator[device]->getMemoryFraction(); } @@ -3869,12 +3866,6 @@ class NativeCachingAllocator : public CUDAAllocator { "Allocator not initialized for device ", device, ": did you call init?"); - TORCH_CHECK( - 0 <= fraction && fraction <= 1, - "invalid fraction:", - fraction, - ". Please set within [0, 1]."); - C10_CUDA_CHECK(c10::cuda::SetDevice(device)); device_allocator[device]->setMemoryFraction(fraction); } @@ -4513,66 +4504,3 @@ std::atomic allocator; static BackendStaticInitializer backend_static_initializer; } // namespace cuda::CUDACachingAllocator } // namespace c10 - -namespace c10::cuda { - -// uid_ is incremented when a user creates a MemPool, -// for example: using graph_pool_handle() or c10::cuda::MemPool(). -// -// uuid_ is incremented when CUDAGraph creates a MemPool -// as a result of a user not providing a pool. -// -// MempoolId_t of {0, 0} is used to denote when no MemPool has been -// passed to a function, either by user or CUDAGraphs. For example, -// default value of MempoolId_t for capture_begin function is {0, 0}. -// That's why uid_ and uuid_ start at 1. 
-std::atomic MemPool::uid_{1}; -std::atomic MemPool::uuid_{1}; - -MemPool::MemPool( - CUDACachingAllocator::CUDAAllocator* allocator, - bool is_user_created, - bool use_on_oom) - : allocator_(allocator), is_user_created_(is_user_created) { - if (is_user_created_) { - id_ = {0, uid_++}; - } else { - id_ = {uuid_++, 0}; - } - device_ = c10::cuda::current_device(); - CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator); - if (use_on_oom) { - CUDACachingAllocator::setUseOnOOM(device_, id_); - } -} - -MemPool::~MemPool() { - TORCH_INTERNAL_ASSERT(use_count() == 1); - CUDACachingAllocator::releasePool(device_, id_); - c10::cuda::CUDACachingAllocator::emptyCache(id_); -} - -MempoolId_t MemPool::id() { - return id_; -} - -CUDACachingAllocator::CUDAAllocator* MemPool::allocator() { - return allocator_; -} - -int MemPool::use_count() { - return CUDACachingAllocator::getPoolUseCount(device_, id_); -} - -c10::DeviceIndex MemPool::device() { - return device_; -} - -MempoolId_t MemPool::graph_pool_handle(bool is_user_created) { - if (is_user_created) { - return {0, uid_++}; - } - return {uuid_++, 0}; -} - -} // namespace c10::cuda diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index fbe5dab18e0ae..e7b45072f6c20 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -344,6 +345,13 @@ class CUDAAllocator : public DeviceAllocator { c10::DeviceIndex device, std::shared_ptr pps) = 0; virtual std::string name() = 0; + std::pair getMemoryInfo(c10::DeviceIndex device) override { + c10::DeviceGuard device_guard({at::kCUDA, device}); + size_t free = 0; + size_t total = 0; + C10_CUDA_CHECK(cudaMemGetInfo(&free, &total)); + return {free, total}; + } }; // Allocator object, statically initialized @@ -554,41 +562,7 @@ inline std::string getUserMetadata() { } // namespace c10::cuda::CUDACachingAllocator namespace c10::cuda { - // Keep BC only using c10::CaptureId_t; using c10::MempoolId_t; - -// MemPool represents a pool of memory in a caching allocator. Currently, -// it's just the ID of the pool object maintained in the CUDACachingAllocator. -// -// An allocator pointer can be passed to the MemPool to define how the -// allocations should be done in the pool. For example: using a different -// system allocator such as ncclMemAlloc. 
-struct C10_CUDA_API MemPool { - MemPool( - CUDACachingAllocator::CUDAAllocator* allocator = nullptr, - bool is_user_created = true, - bool use_on_oom = false); - MemPool(const MemPool&) = delete; - MemPool(MemPool&&) = default; - MemPool& operator=(const MemPool&) = delete; - MemPool& operator=(MemPool&&) = default; - ~MemPool(); - - MempoolId_t id(); - CUDACachingAllocator::CUDAAllocator* allocator(); - int use_count(); - c10::DeviceIndex device(); - static MempoolId_t graph_pool_handle(bool is_user_created = true); - - private: - static std::atomic uid_; - static std::atomic uuid_; - CUDACachingAllocator::CUDAAllocator* allocator_; - bool is_user_created_; - MempoolId_t id_; - c10::DeviceIndex device_; -}; - } // namespace c10::cuda diff --git a/c10/cuda/CUDADeviceAssertionHost.cpp b/c10/cuda/CUDADeviceAssertionHost.cpp index d67ee4b23e692..08e657a411614 100644 --- a/c10/cuda/CUDADeviceAssertionHost.cpp +++ b/c10/cuda/CUDADeviceAssertionHost.cpp @@ -136,7 +136,7 @@ std::string c10_retrieve_device_side_assertion_info() { // Something failed, let's talk about that oss << failures_found << " CUDA device-side assertion failures were found on GPU #" - << device_num << "!" << std::endl; + << device_num << '!' << std::endl; if (assertion_data_for_device.assertion_count > C10_CUDA_DSA_ASSERTION_COUNT) { oss << "But at least " << assertion_data_for_device.assertion_count @@ -151,17 +151,17 @@ std::string c10_retrieve_device_side_assertion_info() { oss << "Assertion failure " << i << std::endl; oss << " GPU assertion failure message = " << self.assertion_msg << std::endl; - oss << " File containing assertion = " << self.filename << ":" + oss << " File containing assertion = " << self.filename << ':' << self.line_number << std::endl; oss << " Device function containing assertion = " << self.function_name << std::endl; - oss << " Thread ID that failed assertion = [" << self.thread_id[0] << "," - << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl; - oss << " Block ID that failed assertion = [" << self.block_id[0] << "," - << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl; + oss << " Thread ID that failed assertion = [" << self.thread_id[0] << ',' + << self.thread_id[1] << ',' << self.thread_id[2] << ']' << std::endl; + oss << " Block ID that failed assertion = [" << self.block_id[0] << ',' + << self.block_id[1] << ',' << self.block_id[2] << ']' << std::endl; if (launch_info.generation_number == self.caller) { oss << " File containing kernel launch = " - << launch_info.launch_filename << ":" << launch_info.launch_linenum + << launch_info.launch_filename << ':' << launch_info.launch_linenum << std::endl; oss << " Function containing kernel launch = " << launch_info.launch_function << std::endl; @@ -175,7 +175,7 @@ std::string c10_retrieve_device_side_assertion_info() { if (launch_registry.gather_launch_stacktrace) { oss << "Launch stacktracing disabled." << std::endl; } else { - oss << "\n" << launch_info.launch_stacktrace << std::endl; + oss << '\n' << launch_info.launch_stacktrace << std::endl; } } else { oss << " CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_size`." 
@@ -295,11 +295,19 @@ DeviceAssertionsData* CUDAKernelLaunchRegistry:: C10_CUDA_CHECK_WO_DSA( cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData))); +#if CUDART_VERSION >= 13000 + cudaMemLocation cpuDevice; + cpuDevice.type = cudaMemLocationTypeDevice; + cpuDevice.id = cudaCpuDeviceId; +#else + const auto cpuDevice = cudaCpuDeviceId; +#endif + C10_CUDA_CHECK_WO_DSA(cudaMemAdvise( uvm_assertions_ptr, sizeof(DeviceAssertionsData), cudaMemAdviseSetPreferredLocation, - cudaCpuDeviceId)); + cpuDevice)); // GPU will establish direct mapping of data in CPU memory, no page faults // will be generated @@ -307,7 +315,7 @@ DeviceAssertionsData* CUDAKernelLaunchRegistry:: uvm_assertions_ptr, sizeof(DeviceAssertionsData), cudaMemAdviseSetAccessedBy, - cudaCpuDeviceId)); + cpuDevice)); // Initialize the memory from the CPU; otherwise, pages may have to be created // on demand. We think that UVM documentation indicates that first access may diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp index 93bce51f1b9d0..674eb00035c50 100644 --- a/c10/cuda/CUDAMallocAsyncAllocator.cpp +++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp @@ -427,7 +427,6 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator { // on the current device each later call sees. void init(int dev_count) override { static bool called = [](int dev_count) { - ; // Are there external guarantees init will be called before // any of the allocator's other functions? // std::lock_guard lk(general_mutex); diff --git a/c10/cuda/driver_api.h b/c10/cuda/driver_api.h index 380e7939ff76c..1ff0c9a12ac78 100644 --- a/c10/cuda/driver_api.h +++ b/c10/cuda/driver_api.h @@ -20,6 +20,22 @@ } \ } while (0) +#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT) \ + do { \ + CUresult __err = EXPR; \ + if (__err != CUDA_SUCCESS) { \ + const char* err_str; \ + CUresult get_error_str_err [[maybe_unused]] = \ + c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \ + if (get_error_str_err != CUDA_SUCCESS) { \ + TORCH_WARN("CUDA driver error: unknown error"); \ + } else { \ + TORCH_WARN("CUDA driver error: ", err_str); \ + } \ + goto NEXT; \ + } \ + } while (0) + // The integer in the second column specifies the requested CUDA Driver API // version. 
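The guarded `cudaMemLocation` block above exists because CUDA 13 changed `cudaMemAdvise` to take a `cudaMemLocation` rather than a bare device id, and the new `C10_CUDA_DRIVER_CHECK_GOTO` macro in `driver_api.h` similarly softens failures by warning and jumping to a cleanup label instead of erroring out. A version-portable sketch of the advise pattern, mirroring the call shape and the `cudaMemLocationTypeDevice` + `cudaCpuDeviceId` choice used in the hunk above (hypothetical helper, error propagation left to the caller):

    #include <cuda_runtime.h>

    // Sketch: mark a managed allocation as preferring to live in CPU memory.
    cudaError_t advise_prefer_cpu(void* ptr, size_t bytes) {
    #if CUDART_VERSION >= 13000
      cudaMemLocation loc{};
      loc.type = cudaMemLocationTypeDevice;  // same choice as the diff above
      loc.id = cudaCpuDeviceId;
      return cudaMemAdvise(ptr, bytes, cudaMemAdviseSetPreferredLocation, loc);
    #else
      return cudaMemAdvise(ptr, bytes, cudaMemAdviseSetPreferredLocation,
                           cudaCpuDeviceId);
    #endif
    }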
The dynamic loader will accept a driver with a newer version, but it // ensures that the requested symbol exists in *at least* the specified version diff --git a/c10/metal/error.h b/c10/metal/error.h new file mode 100644 index 0000000000000..bed113769747a --- /dev/null +++ b/c10/metal/error.h @@ -0,0 +1,111 @@ +#pragma once +#include + +namespace c10 { +namespace metal { +C10_METAL_CONSTEXPR unsigned error_message_count = 30; +struct ErrorMessage { + char file[128]; + char func[128]; + char message[250]; + unsigned int line; +}; + +struct ErrorMessages { +#ifdef __METAL__ + ::metal::atomic count; +#else + unsigned int count; +#endif + ErrorMessage msg[error_message_count]; +}; + +#ifdef __METAL__ +namespace detail { +static uint strncpy(device char* dst, constant const char* src, unsigned len) { + uint i = 0; + while (src[i] != 0 && i < len - 1) { + dst[i] = src[i]; + i++; + } + dst[i] = 0; + return i; +} + +inline uint print_arg( + device char* ptr, + unsigned len, + constant const char* arg) { + return strncpy(ptr, arg, len); +} + +// Returns number length as string in base10 +static inline uint base10_length(long num) { + uint rc = 1; + if (num < 0) { + num = -num; + rc += 1; + } + while (num > 9) { + num /= 10; + rc++; + } + return rc; +} + +// Converts signed integer to string +inline uint print_arg(device char* ptr, unsigned len, long arg) { + const auto arg_len = base10_length(arg); + if (arg_len >= len) + return 0; + if (arg < 0) { + ptr[0] = '-'; + arg = -arg; + } + uint idx = 1; + do { + ptr[arg_len - idx] = '0' + (arg % 10); + arg /= 10; + idx++; + } while (arg > 0); + ptr[arg_len] = 0; + return arg_len; +} + +template +inline void print_args(device char* ptr, unsigned len, T arg) { + print_arg(ptr, len, arg); +} + +template +inline void print_args(device char* ptr, unsigned len, T arg, Args... args) { + const auto rc = print_arg(ptr, len, arg); + print_args(ptr + rc, len - rc, args...); +} + +} // namespace detail + +template +static void report_error( + device ErrorMessages* msgs, + constant const char* file, + int line, + constant const char* func, + Args... args) { + const auto idx = + atomic_fetch_add_explicit(&msgs->count, 1, ::metal::memory_order_relaxed); + if (idx >= error_message_count) { + return; + } + device auto* msg = &msgs->msg[idx]; + detail::strncpy(msg->file, file, 128); + detail::strncpy(msg->func, func, 128); + detail::print_args(msg->message, 250, args...); + msg->line = line; +} + +#define TORCH_REPORT_ERROR(buf, ...) 
\ + ::c10::metal::report_error(buf, __FILE__, __LINE__, __func__, __VA_ARGS__) +#endif +} // namespace metal +} // namespace c10 diff --git a/c10/test/build.bzl b/c10/test/build.bzl index deb917dd8fcf3..7b4028ab4afed 100644 --- a/c10/test/build.bzl +++ b/c10/test/build.bzl @@ -66,6 +66,15 @@ def define_targets(rules): ], ) + rules.cc_test( + name = "util/nofatal_test", + srcs = ["util/nofatal_test.cpp"], + deps = [ + "//c10/util:base", + "@com_google_googletest//:gtest_main", + ], + ) + rules.cc_test( name = "util/ssize_test", srcs = ["util/ssize_test.cpp"], diff --git a/c10/test/core/DispatchKeySet_test.cpp b/c10/test/core/DispatchKeySet_test.cpp index a93461a041c39..cdbdc150167e0 100644 --- a/c10/test/core/DispatchKeySet_test.cpp +++ b/c10/test/core/DispatchKeySet_test.cpp @@ -435,7 +435,7 @@ TEST(DispatchKeySet, TestFunctionalityDispatchKeyToString) { if (i > 0) { ASSERT_TRUE(res.find("Unknown") == std::string::npos) << i << " (before is " << toString(static_cast(i - 1)) - << ")"; + << ')'; } else { ASSERT_TRUE(res.find("Unknown") == std::string::npos) << i; } diff --git a/c10/test/util/Half_test.cpp b/c10/test/util/Half_test.cpp index a76814615101b..33c77ead61fc8 100644 --- a/c10/test/util/Half_test.cpp +++ b/c10/test/util/Half_test.cpp @@ -96,10 +96,10 @@ TEST(HalfConversionTest, TestPorableConversion) { for (auto x : inputs) { auto target = c10::detail::fp16_ieee_to_fp32_value(x); EXPECT_EQ(halfbits2float(x), target) - << "Test failed for uint16 to float " << x << "\n"; + << "Test failed for uint16 to float " << x << '\n'; EXPECT_EQ( float2halfbits(target), c10::detail::fp16_ieee_from_fp32_value(target)) - << "Test failed for float to uint16" << target << "\n"; + << "Test failed for float to uint16" << target << '\n'; } } diff --git a/c10/test/util/logging_test.cpp b/c10/test/util/logging_test.cpp index b8fc81ddc6bbe..4587130564dfc 100644 --- a/c10/test/util/logging_test.cpp +++ b/c10/test/util/logging_test.cpp @@ -98,7 +98,7 @@ struct Noncopyable { }; std::ostream& operator<<(std::ostream& out, const Noncopyable& nc) { - out << "Noncopyable(" << nc.x << ")"; + out << "Noncopyable(" << nc.x << ')'; return out; } } // namespace diff --git a/c10/test/util/nofatal_test.cpp b/c10/test/util/nofatal_test.cpp new file mode 100644 index 0000000000000..ba4b40b6f917e --- /dev/null +++ b/c10/test/util/nofatal_test.cpp @@ -0,0 +1,53 @@ +#include + +#include +#include + +namespace { +template +inline void expectThrowsEq(T&& fn, const char* expected_msg) { + try { + std::forward(fn)(); + } catch (const c10::Error& e) { + EXPECT_TRUE( + std::string(e.what_without_backtrace()).find(expected_msg) != + std::string::npos); + return; + } + ADD_FAILURE() << "Expected to throw exception with message \"" << expected_msg + << "\" but didn't throw"; +} +} // namespace + +TEST(NofatalTest, TorchCheckComparisons) { + // quick make sure that no-op works as expected + TORCH_CHECK_EQ(1, 1) << "i am a silly message " << 1; + expectThrowsEq( + []() { TORCH_CHECK_EQ(1, 2) << "i am a silly message " << 1; }, + "Check failed: 1 == 2 (1 vs. 2). i am a silly message 1"); + expectThrowsEq( + []() { TORCH_CHECK_NE(2, 2); }, "Check failed: 2 != 2 (2 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_LT(2, 2); }, "Check failed: 2 < 2 (2 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_LE(3, 2); }, "Check failed: 3 <= 2 (3 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_GT(2, 2); }, "Check failed: 2 > 2 (2 vs. 2)."); + expectThrowsEq( + []() { TORCH_CHECK_GE(2, 3); }, "Check failed: 2 >= 3 (2 vs. 
3)."); + expectThrowsEq( + []() { + void* p = nullptr; + TORCH_CHECK_NOTNULL(p); + }, + "Check failed: 'p' must be non NULL."); + +#if GTEST_HAS_DEATH_TEST +#ifndef NDEBUG + // if dbg build, DCHECK should result in deth + EXPECT_DEATH(TORCH_DCHECK_EQ(1, 2), "Check failed"); +#else + TORCH_DCHECK_EQ(1, 2); // no-op +#endif +#endif // GTEST_HAS_DEATH_TEST +} diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 64605f5153595..55900b6ee43c6 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -40,200 +41,105 @@ namespace c10 { /// /// This is intended to be trivially copyable, so it should be passed by /// value. +/// +/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct +/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of +/// the underlying constexpr calls, we rely on apparent-type dispatch for +/// inheritance. This should be fine because their memory format is the same, +/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods. +/// However, you should prefer to use ArrayRef when possible, because its use +/// of TORCH_CHECK will lead to better user-facing error messages. template -class ArrayRef final { +// ArrayRef cannot be derived from. Normally, we would use `final` +// specifier to force this constraint at compile time. However, Intel +// compiler does not recognize ArrayRef as a class template (which is +// required in the definition of at::TensorAccessor, for instance) +// when `final` specifier is used. So, we cannot define ArrayRef as +// final because of the Intel compiler issue. +class ArrayRef : public HeaderOnlyArrayRef { public: - using iterator = const T*; - using const_iterator = const T*; - using size_type = size_t; - using value_type = T; - - using reverse_iterator = std::reverse_iterator; - - private: - /// The start of the array, in an external buffer. - const T* Data; - - /// The number of elements. - size_type Length; - - void debugCheckNullptrInvariant() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - Data != nullptr || Length == 0, - "created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal"); - } - - public: - /// @name Constructors + /// @name Constructors, all inherited from HeaderOnlyArrayRef except for + /// SmallVector. As inherited constructors won't work with class template + /// argument deduction (CTAD) until C++23, we add deduction guides after + /// the class definition to enable CTAD. /// @{ - /// Construct an empty ArrayRef. - /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {} - - /// Construct an ArrayRef from a single element. - // TODO Make this explicit - constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} - - /// Construct an ArrayRef from a pointer and length. - constexpr ArrayRef(const T* data, size_t length) - : Data(data), Length(length) { - debugCheckNullptrInvariant(); - } - - /// Construct an ArrayRef from a range. - constexpr ArrayRef(const T* begin, const T* end) - : Data(begin), Length(end - begin) { - debugCheckNullptrInvariant(); - } + using HeaderOnlyArrayRef::HeaderOnlyArrayRef; /// Construct an ArrayRef from a SmallVector. This is templated in order to /// avoid instantiating SmallVectorTemplateCommon whenever we /// copy-construct an ArrayRef. + /// NOTE: this is the only constructor that is not inherited from + /// HeaderOnlyArrayRef. 
template /* implicit */ ArrayRef(const SmallVectorTemplateCommon& Vec) - : Data(Vec.data()), Length(Vec.size()) { - debugCheckNullptrInvariant(); - } - - template < - typename Container, - typename U = decltype(std::declval().data()), - typename = std::enable_if_t< - (std::is_same_v || std::is_same_v)>> - /* implicit */ ArrayRef(const Container& container) - : Data(container.data()), Length(container.size()) { - debugCheckNullptrInvariant(); - } - - /// Construct an ArrayRef from a std::vector. - // The enable_if stuff here makes sure that this isn't used for - // std::vector, because ArrayRef can't work on a std::vector - // bitfield. - template - /* implicit */ ArrayRef(const std::vector& Vec) - : Data(Vec.data()), Length(Vec.size()) { - static_assert( - !std::is_same_v, - "ArrayRef cannot be constructed from a std::vector bitfield."); - } - - /// Construct an ArrayRef from a std::array - template - /* implicit */ constexpr ArrayRef(const std::array& Arr) - : Data(Arr.data()), Length(N) {} - - /// Construct an ArrayRef from a C array. - template - // NOLINTNEXTLINE(*c-arrays*) - /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} - - /// Construct an ArrayRef from a std::initializer_list. - /* implicit */ constexpr ArrayRef(const std::initializer_list& Vec) - : Data( - std::begin(Vec) == std::end(Vec) ? static_cast(nullptr) - : std::begin(Vec)), - Length(Vec.size()) {} + : HeaderOnlyArrayRef(Vec.data(), Vec.size()) {} /// @} - /// @name Simple Operations + /// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef /// @{ - constexpr iterator begin() const { - return Data; - } - constexpr iterator end() const { - return Data + Length; - } - - // These are actually the same as iterator, since ArrayRef only - // gives you const iterators. - constexpr const_iterator cbegin() const { - return Data; - } - constexpr const_iterator cend() const { - return Data + Length; - } - - constexpr reverse_iterator rbegin() const { - return reverse_iterator(end()); - } - constexpr reverse_iterator rend() const { - return reverse_iterator(begin()); - } - - /// Check if all elements in the array satisfy the given expression - constexpr bool allMatch(const std::function& pred) const { - return std::all_of(cbegin(), cend(), pred); - } - - /// empty - Check if the array is empty. - constexpr bool empty() const { - return Length == 0; - } - - constexpr const T* data() const { - return Data; - } - - /// size - Get the array size. - constexpr size_t size() const { - return Length; - } - /// front - Get the first element. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& front() const { TORCH_CHECK( - !empty(), "ArrayRef: attempted to access front() of empty list"); - return Data[0]; + !this->empty(), "ArrayRef: attempted to access front() of empty list"); + return this->Data[0]; } /// back - Get the last element. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& back() const { - TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); - return Data[Length - 1]; - } - - /// equals - Check for element-wise equality. 
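Because most constructors are now pulled in with `using HeaderOnlyArrayRef::HeaderOnlyArrayRef;`, class template argument deduction no longer comes for free; that is exactly what the deduction guides added after the class (later in this hunk) restore. An illustrative sketch, not taken from the diff, of the call sites those guides are meant to keep working:

    #include <array>
    #include <vector>
    #include <c10/util/ArrayRef.h>

    void ctad_still_works() {
      std::vector<int> v{1, 2, 3};
      std::array<float, 2> a{1.f, 2.f};
      int raw[3] = {4, 5, 6};

      c10::ArrayRef r1(v);                   // deduces ArrayRef<int>   (std::vector guide)
      c10::ArrayRef r2(a);                   // deduces ArrayRef<float> (std::array guide)
      c10::ArrayRef r3(raw);                 // deduces ArrayRef<int>   (C array guide)
      c10::ArrayRef r4(v.data(), v.size());  // deduces ArrayRef<int>   (pointer + length guide)
      (void)r1; (void)r2; (void)r3; (void)r4;
    }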
- constexpr bool equals(ArrayRef RHS) const { - return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); + TORCH_CHECK( + !this->empty(), "ArrayRef: attempted to access back() of empty list"); + return this->Data[this->Length - 1]; } /// slice(n, m) - Take M elements of the array starting at element N + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr ArrayRef slice(size_t N, size_t M) const { TORCH_CHECK( - N + M <= size(), + N + M <= this->size(), "ArrayRef: invalid slice, N = ", N, "; M = ", M, "; size = ", - size()); - return ArrayRef(data() + N, M); + this->size()); + return ArrayRef(this->data() + N, M); } /// slice(n) - Chop off the first N elements of the array. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr ArrayRef slice(size_t N) const { TORCH_CHECK( - N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size()); - return slice(N, size() - N); + N <= this->size(), + "ArrayRef: invalid slice, N = ", + N, + "; size = ", + this->size()); + return slice(N, this->size() - N); // should this slice be this->slice? } /// @} /// @name Operator Overloads /// @{ - constexpr const T& operator[](size_t Index) const { - return Data[Index]; - } /// Vector compatibility + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& at(size_t Index) const { TORCH_CHECK( - Index < Length, + Index < this->Length, "ArrayRef: invalid index Index = ", Index, "; Length = ", - Length); - return Data[Index]; + this->Length); + return this->Data[Index]; } /// Disallow accidental assignment from a temporary. @@ -253,26 +159,58 @@ class ArrayRef final { std::enable_if_t, ArrayRef>& operator=( std::initializer_list) = delete; - /// @} - /// @name Expensive Operations - /// @{ - std::vector vec() const { - return std::vector(Data, Data + Length); - } - /// @} }; +/// Deduction guides for ArrayRef to support CTAD with inherited constructors +/// These mirror the constructors inherited from HeaderOnlyArrayRef +/// @{ + +// Single element constructor +template +ArrayRef(const T&) -> ArrayRef; + +// Pointer and length constructor +template +ArrayRef(const T*, size_t) -> ArrayRef; + +// Range constructor (begin, end) +template +ArrayRef(const T*, const T*) -> ArrayRef; + +// Generic container constructor (anything with .data() and .size()) +template +ArrayRef(const Container&) -> ArrayRef< + std::remove_pointer_t().data())>>; + +// std::vector constructor +template +ArrayRef(const std::vector&) -> ArrayRef; + +// std::array constructor +template +ArrayRef(const std::array&) -> ArrayRef; + +// C array constructor +template +ArrayRef(const T (&)[N]) -> ArrayRef; + +// std::initializer_list constructor +template +ArrayRef(const std::initializer_list&) -> ArrayRef; + +/// @} + template std::ostream& operator<<(std::ostream& out, ArrayRef list) { int i = 0; - out << "["; + out << '['; for (const auto& e : list) { if (i++ > 0) out << ", "; out << e; } - out << "]"; + out << ']'; return out; } diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp index 8838cafb029e4..29dbfe427ae01 100644 --- a/c10/util/Backtrace.cpp +++ b/c10/util/Backtrace.cpp @@ -106,8 +106,8 @@ class GetBacktraceImpl { /*length*/ &length, /*status*/ &status); - os << " frame #" << idx++ << "\t" - << ((demangled != NULL && status == 0) ? demangled : symbol) << "[" + os << " frame #" << idx++ << '\t' + << ((demangled != NULL && status == 0) ? 
demangled : symbol) << '[' << addr << "]\t" << std::endl; } free(demangled); @@ -274,7 +274,7 @@ class GetBacktraceImpl { } else { // In the edge-case where we couldn't parse the frame string, we can // just use it directly (it may have a different format). - stream << symbols[frame_number] << "\n"; + stream << symbols[frame_number] << '\n'; } } @@ -413,8 +413,8 @@ class GetBacktraceImpl { << back_trace_[i_frame] << std::dec; if (with_symbol) { stream << std::setfill('0') << std::setw(16) << std::uppercase - << std::hex << p_symbol->Address << std::dec << " " << module - << "!" << p_symbol->Name; + << std::hex << p_symbol->Address << std::dec << ' ' << module + << '!' << p_symbol->Name; } else { stream << " " << module << "!"; } @@ -424,7 +424,7 @@ class GetBacktraceImpl { } else { stream << " @ "; } - stream << "]" << std::endl; + stream << ']' << std::endl; } return stream.str(); diff --git a/c10/util/Exception.cpp b/c10/util/Exception.cpp index 1928c2c175c7b..50f423f917981 100644 --- a/c10/util/Exception.cpp +++ b/c10/util/Exception.cpp @@ -44,7 +44,7 @@ std::string Error::compute_what(bool include_backtrace) const { if (context_.size() == 1) { // Fold error and context in one line - oss << " (" << context_[0] << ")"; + oss << " (" << context_[0] << ')'; } else { for (const auto& c : context_) { oss << "\n " << c; @@ -52,7 +52,7 @@ std::string Error::compute_what(bool include_backtrace) const { } if (include_backtrace && backtrace_) { - oss << "\n" << backtrace_->get(); + oss << '\n' << backtrace_->get(); } return oss.str(); @@ -247,7 +247,7 @@ void WarningHandler::process(const Warning& warning) { LOG_AT_FILE_LINE( WARNING, warning.source_location().file, warning.source_location().line) << "Warning: " << warning.msg() << " (function " - << warning.source_location().function << ")"; + << warning.source_location().function << ')'; } std::string GetExceptionString(const std::exception& e) { diff --git a/c10/util/Exception.h b/c10/util/Exception.h index 6b2fd626bfb5e..a4537c862ae7b 100644 --- a/c10/util/Exception.h +++ b/c10/util/Exception.h @@ -379,7 +379,11 @@ C10_API std::string GetExceptionString(const std::exception& e); // ---------------------------------------------------------------------------- #ifdef STRIP_ERROR_MESSAGES -#define TORCH_RETHROW(e, ...) throw +#define TORCH_RETHROW(e, ...) \ + do { \ + (void)e; /* Suppress unused variable warning */ \ + throw; \ + } while (false) #else #define TORCH_RETHROW(e, ...) \ do { \ @@ -702,6 +706,98 @@ namespace c10::detail { #define TORCH_CHECK_ARG(cond, argN, ...) \ TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__) +#ifndef FATAL_IF +#ifdef C10_USE_GLOG +#define FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \ + .stream() +#else +#define FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream() +#endif +#endif + +#ifndef NON_FATAL_IF +#ifdef C10_USE_GLOG +#define NON_FATAL_IF(condition) \ + condition ? (void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger( \ + __FILE__, __LINE__, ::google::GLOG_FATAL, false) \ + .stream() +#else +#define NON_FATAL_IF(condition) \ + condition ? 
(void)0 \ + : ::c10::LoggerVoidify() & \ + ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \ + .stream() +#endif +#endif + +// Binary comparison check macros +#define TORCH_CHECK_OP(val1, val2, op) \ + NON_FATAL_IF(((val1)op(val2))) \ + << "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \ + << (val2) << "). " + +#define TORCH_DCHECK_OP(val1, val2, op) \ + FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \ + << (val1) << " vs. " << (val2) << "). " + +#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==) +#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=) +#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=) +#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <) +#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=) +#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >) + +// Debug versions of TORCH_CHECK_OP macros +#ifndef NDEBUG +#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==) +#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=) +#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=) +#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <) +#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=) +#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >) +#else // !NDEBUG +// Optimized versions - generate no code +#define TORCH_DCHECK_EQ(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, ==) +#define TORCH_DCHECK_NE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, !=) +#define TORCH_DCHECK_LE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, <=) +#define TORCH_DCHECK_LT(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, <) +#define TORCH_DCHECK_GE(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, >=) +#define TORCH_DCHECK_GT(val1, val2) \ + while (false) \ + TORCH_DCHECK_OP(val1, val2, >) +#endif // NDEBUG + +// Null pointer check macro +#define TORCH_CHECK_NOTNULL(val) \ + ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false) + +#ifndef NDEBUG +#define TORCH_DCHECK_NOTNULL(val) \ + ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true) +#else // !NDEBUG +#define TORCH_DCHECK_NOTNULL(val) \ + while (false) \ + TORCH_CHECK_NOTNULL(val) +#endif // NDEBUG + // ---------------------------------------------------------------------------- // Deprecated macros // ---------------------------------------------------------------------------- diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index 555ab685c0b5f..298503dfbe340 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -291,6 +291,32 @@ namespace c10 { using fLB::FLAGS_logtostderr; using fLI::FLAGS_minloglevel; using fLI::FLAGS_v; + +MessageLogger::MessageLogger( + const char* file, + int line, + int severity, + bool exit_on_fatal) + : stream_(), severity_(severity), exit_on_fatal_(exit_on_fatal) {} + +MessageLogger::~MessageLogger() noexcept(false) { + if (severity_ == ::google::GLOG_FATAL) { + DealWithFatal(); + } +} + +std::stringstream& MessageLogger::stream() { + return stream_; +} + +void MessageLogger::DealWithFatal() { + if (exit_on_fatal_) { + LOG(FATAL) << stream_.str(); + } else { + throw c10::Error(stream_.str(), nullptr, nullptr); + } +} + } // namespace c10 C10_DEFINE_int( @@ -412,17 +438,16 @@ void ShowLogInfoToStderr() { FLAGS_caffe2_log_level = GLOG_INFO; } -MessageLogger::MessageLogger(const char* file, int line, int 
severity) - : severity_(severity) { +MessageLogger::MessageLogger( + const char* file, + int line, + int severity, + bool exit_on_fatal) + : severity_(severity), exit_on_fatal_(exit_on_fatal) { if (severity_ < FLAGS_caffe2_log_level) { // Nothing needs to be logged. return; } -#ifdef ANDROID - tag_ = "native"; -#else // !ANDROID - tag_ = ""; -#endif // ANDROID time_t rawtime = 0; time(&rawtime); @@ -448,22 +473,22 @@ MessageLogger::MessageLogger(const char* file, int line, int severity) if (GLOBAL_RANK != -1) { stream_ << "[rank" << GLOBAL_RANK << "]:"; } - stream_ << "[" << CAFFE2_SEVERITY_PREFIX[std::min(4, GLOG_FATAL - severity_)] + stream_ << '[' << CAFFE2_SEVERITY_PREFIX[std::min(4, GLOG_FATAL - severity_)] << (timeinfo->tm_mon + 1) * 100 + timeinfo->tm_mday - << std::setfill('0') << " " << std::setw(2) << timeinfo->tm_hour - << ":" << std::setw(2) << timeinfo->tm_min << ":" << std::setw(2) - << timeinfo->tm_sec << "." << std::setw(9) << ns << " " - << c10::detail::StripBasename(std::string(file)) << ":" << line + << std::setfill('0') << ' ' << std::setw(2) << timeinfo->tm_hour + << ':' << std::setw(2) << timeinfo->tm_min << ':' << std::setw(2) + << timeinfo->tm_sec << '.' << std::setw(9) << ns << ' ' + << c10::detail::StripBasename(std::string(file)) << ':' << line << "] "; } // Output the contents of the stream to the proper channel on destruction. -MessageLogger::~MessageLogger() { +MessageLogger::~MessageLogger() noexcept(false) { if (severity_ < FLAGS_caffe2_log_level) { // Nothing needs to be logged. return; } - stream_ << "\n"; + stream_ << '\n'; #ifdef ANDROID static const int android_log_levels[] = { ANDROID_LOG_FATAL, // LOG_FATAL @@ -498,6 +523,18 @@ MessageLogger::~MessageLogger() { } } +std::stringstream& MessageLogger::stream() { + return stream_; +} + +void MessageLogger::DealWithFatal() { + if (exit_on_fatal_) { + abort(); + } else { + throw c10::Error(stream_.str(), nullptr, nullptr); + } +} + } // namespace c10 #endif // !C10_USE_GLOG diff --git a/c10/util/Metaprogramming.cpp b/c10/util/Metaprogramming.cpp deleted file mode 100644 index f6ee24a79bcd8..0000000000000 --- a/c10/util/Metaprogramming.cpp +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/c10/util/Metaprogramming.h b/c10/util/Metaprogramming.h index d504706f3283a..a5912706e1ed1 100644 --- a/c10/util/Metaprogramming.h +++ b/c10/util/Metaprogramming.h @@ -1,224 +1 @@ -#pragma once - -#include -#include - -namespace c10::guts { - -/** - * Access information about result type or arguments from a function type. - * Example: - * using A = function_traits::return_type // A == int - * using A = function_traits::parameter_types::tuple_type - * // A == tuple - */ -template -struct function_traits { - static_assert( - !std::is_same_v, - "In function_traits, Func must be a plain function type."); -}; -template -struct function_traits { - using func_type = Result(Args...); - using return_type = Result; - using parameter_types = typelist::typelist; - static constexpr auto number_of_parameters = sizeof...(Args); -}; - -/** - * infer_function_traits: creates a `function_traits` type for a simple - * function (pointer) or functor (lambda/struct). Currently does not support - * class methods. 
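The `TORCH_CHECK_EQ`/`_NE`/`_LT`/`_LE`/`_GT`/`_GE` macros introduced in `Exception.h` ride on the non-fatal `MessageLogger` path implemented above: on failure they throw `c10::Error` (so they can be caught like any `TORCH_CHECK`), while the `TORCH_DCHECK_*` forms stay fatal in debug builds and generate no code under `NDEBUG`. A small usage sketch with a hypothetical function, assuming `<c10/util/Logging.h>` pulls in the macro and `MessageLogger` definitions:

    #include <c10/util/Logging.h>
    #include <iostream>

    int checked_index(int i, int size) {
      TORCH_CHECK_GE(i, 0) << "index must be non-negative";
      TORCH_CHECK_LT(i, size) << "index out of range for size " << size;
      return i;
    }

    int main() {
      try {
        checked_index(5, 3);
      } catch (const c10::Error& e) {
        // e.g. "Check failed: i < size (5 vs. 3). index out of range for size 3"
        std::cerr << e.what_without_backtrace() << '\n';
      }
      return 0;
    }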
- */ - -template -struct infer_function_traits { - using type = function_traits< - c10::guts::detail::strip_class_t>; -}; - -template -struct infer_function_traits { - using type = function_traits; -}; - -template -struct infer_function_traits { - using type = function_traits; -}; - -template -using infer_function_traits_t = typename infer_function_traits::type; - -/** - * make_function_traits: creates a `function_traits` type given a Return type - * and a typelist of Argument types - * - * Example: - * bool f(int, int); - * - * infer_function_traits_t == make_function_traits_t> - */ -template -struct make_function_traits { - static_assert( - false_t::value, - "In guts::make_function_traits, the ArgList argument must be typelist<...>."); -}; - -template -struct make_function_traits> { - using type = function_traits; -}; - -template -using make_function_traits_t = - typename make_function_traits::type; - -/** - * make_offset_index_sequence - * Like make_index_sequence, but starting from Start instead of 0. - * - * Example: - * make_offset_index_sequence<10, 3> == std::index_sequence<10, 11, 12> - */ -template -struct make_offset_index_sequence_impl - : make_offset_index_sequence_impl { - static_assert( - static_cast(Start) >= 0, - "make_offset_index_sequence: Start < 0"); - static_assert(static_cast(N) >= 0, "make_offset_index_sequence: N < 0"); -}; - -template -struct make_offset_index_sequence_impl { - typedef std::index_sequence type; -}; - -template -using make_offset_index_sequence = - typename make_offset_index_sequence_impl::type; - -/** - * Use tuple_elements to extract a position-indexed subset of elements - * from the argument tuple into a result tuple. - * - * Example: - * std::tuple t = std::make_tuple(0, "HEY", 2.0); - * std::tuple result = tuple_elements(t, std::index_sequence<0, - * 2>()); - */ -template -constexpr auto tuple_elements(Tuple t, std::index_sequence /*unused*/) { - return std::tuple...>(std::get(t)...); -} - -/** - * Use tuple_take to extract the first or last n elements from the argument - * tuple into a result tuple. - * - * Example: - * std::tuple t = std::make_tuple(0, "HEY", 2.0); - * std::tuple first_two = tuple_take(t); - * std::tuple last_two = tuple_take(t); - */ -template -struct TupleTake {}; - -template -struct TupleTake= 0, void>> { - static auto call(Tuple t) { - constexpr size_t size = std::tuple_size(); - static_assert(N <= size, "tuple_take: N > size"); - return tuple_elements(t, std::make_index_sequence{}); - } -}; - -template - struct TupleTake < Tuple, - N, std::enable_if_t> { - static auto call(Tuple t) { - constexpr size_t size = std::tuple_size(); - static_assert(-N <= size, "tuple_take: -N > size"); - return tuple_elements(t, make_offset_index_sequence{}); - } -}; - -template -auto tuple_take(Tuple t) { - return TupleTake::call(t); -} - -/** - * Use tuple_slice to extract a contiguous subtuple from the argument. - * - * Example: - * std::tuple t = std::make_tuple(0, - * "HEY", 2.0, false); std::tuple middle_two = - * tuple_slice(t); - */ -template -constexpr auto tuple_slice(Tuple t) { - constexpr size_t size = std::tuple_size(); - static_assert(Start + N <= size, "tuple_slice: Start + N > size"); - return tuple_elements(t, make_offset_index_sequence{}); -} - -/** - * Use tuple_map to run a mapping function over a tuple to get a new tuple. 
- * - * Example 1: - * auto result = tuple_map(std::tuple(3, 4, 5), [] - * (int32_t a) -> int16_t {return a+1;}); - * // result == std::tuple(4, 5, 6) - * - * Example 2: - * struct Mapper { - * std::string operator()(int32_t a) const { - * return std::to_string(a); - * } - * int64_t operator()(const std::string& a) const { - * return atoi(a.c_str()); - * } - * }; - * auto result = tuple_map(std::tuple(3, "4"), - * Mapper()); - * // result == std::tuple("3", 4) - * - * Example 3: - * struct A final { - * int32_t func() { - * return 5; - * } - * }; - * struct B final { - * std::string func() { - * return "5"; - * } - * }; - * auto result = tuple_map(std::make_tuple(A(), B()), [] (auto a) { return - * a.func(); }); - * // result == std::tuple(5, "5"); - */ -namespace detail { -template -auto tuple_map( - // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) - std::tuple&& tuple, - const Mapper& mapper, - std::index_sequence /*unused*/) { - return std::tuple(std::get( - tuple))))...>(mapper(std::forward(std::get(tuple)))...); -} -} // namespace detail - -template -auto tuple_map(std::tuple&& tuple, const Mapper& mapper) { - return detail::tuple_map( - std::move(tuple), mapper, std::index_sequence_for()); -} - -} // namespace c10::guts +#include diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h index d02c9380a563d..d47f37cdf7eca 100644 --- a/c10/util/SmallVector.h +++ b/c10/util/SmallVector.h @@ -1412,13 +1412,13 @@ inline size_t capacity_in_bytes(const SmallVector& X) { template std::ostream& operator<<(std::ostream& out, const SmallVector& list) { int i = 0; - out << "["; + out << '['; for (auto e : list) { if (i++ > 0) out << ", "; out << e; } - out << "]"; + out << ']'; return out; } diff --git a/c10/util/StringUtil.cpp b/c10/util/StringUtil.cpp index 063a8fc93ea7a..6fae2f004cc93 100644 --- a/c10/util/StringUtil.cpp +++ b/c10/util/StringUtil.cpp @@ -79,7 +79,7 @@ std::ostream& _str(std::ostream& ss, const std::wstring& wString) { } // namespace detail std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { - out << loc.function << " at " << loc.file << ":" << loc.line; + out << loc.function << " at " << loc.file << ':' << loc.line; return out; } diff --git a/c10/util/StringUtil.h b/c10/util/StringUtil.h index cbc6f4ec336bb..de241bc9f7c45 100644 --- a/c10/util/StringUtil.h +++ b/c10/util/StringUtil.h @@ -170,7 +170,7 @@ inline bool isPrint(char s) { } inline void printQuotedString(std::ostream& stmt, const std::string_view str) { - stmt << "\""; + stmt << '"'; for (auto s : str) { switch (s) { case '\\': @@ -224,7 +224,7 @@ inline void printQuotedString(std::ostream& stmt, const std::string_view str) { break; } } - stmt << "\""; + stmt << '"'; } template diff --git a/c10/util/TypeList.h b/c10/util/TypeList.h index 244e5bb141cd7..9f79099710d71 100644 --- a/c10/util/TypeList.h +++ b/c10/util/TypeList.h @@ -1,515 +1 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace c10::guts { - -template -struct false_t : std::false_type {}; -template